whispercpp 1.3.3 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (586)
  1. checksums.yaml +4 -4
  2. data/ext/ruby_whisper_params.c +55 -25
  3. data/ext/sources/CMakeLists.txt +1 -1
  4. data/ext/sources/bindings/javascript/package.json +1 -1
  5. data/ext/sources/build-xcframework.sh +24 -0
  6. data/ext/sources/examples/CMakeLists.txt +1 -0
  7. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  8. data/ext/sources/examples/addon.node/index.js +7 -5
  9. data/ext/sources/examples/bench/bench.cpp +26 -16
  10. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  11. data/ext/sources/examples/cli/cli.cpp +4 -2
  12. data/ext/sources/examples/command/command.cpp +26 -24
  13. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  14. data/ext/sources/examples/common-ggml.cpp +2 -0
  15. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  16. data/ext/sources/examples/server/server.cpp +24 -13
  17. data/ext/sources/examples/server.py +6 -1
  18. data/ext/sources/examples/stream/stream.cpp +4 -2
  19. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  20. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  21. data/ext/sources/examples/talk-llama/CMakeLists.txt +2 -2
  22. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  23. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  24. data/ext/sources/examples/talk-llama/llama-arch.cpp +588 -15
  25. data/ext/sources/examples/talk-llama/llama-arch.h +58 -1
  26. data/ext/sources/examples/talk-llama/llama-batch.cpp +103 -71
  27. data/ext/sources/examples/talk-llama/llama-batch.h +31 -18
  28. data/ext/sources/examples/talk-llama/llama-chat.cpp +120 -5
  29. data/ext/sources/examples/talk-llama/llama-chat.h +7 -0
  30. data/ext/sources/examples/talk-llama/llama-context.cpp +460 -357
  31. data/ext/sources/examples/talk-llama/llama-context.h +44 -29
  32. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  33. data/ext/sources/examples/talk-llama/llama-graph.cpp +543 -271
  34. data/ext/sources/examples/talk-llama/llama-graph.h +278 -168
  35. data/ext/sources/examples/talk-llama/llama-hparams.cpp +118 -4
  36. data/ext/sources/examples/talk-llama/llama-hparams.h +61 -15
  37. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  38. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  39. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  40. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2020 -0
  41. data/ext/sources/examples/talk-llama/llama-kv-cache.h +358 -27
  42. data/ext/sources/examples/talk-llama/llama-kv-cells.h +80 -28
  43. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +56 -36
  44. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  45. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +48 -19
  46. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +13 -14
  47. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  48. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +2 -0
  49. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  50. data/ext/sources/examples/talk-llama/llama-model.cpp +7165 -2336
  51. data/ext/sources/examples/talk-llama/llama-model.h +60 -9
  52. data/ext/sources/examples/talk-llama/llama-quant.cpp +48 -10
  53. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  54. data/ext/sources/examples/talk-llama/llama-vocab.cpp +440 -13
  55. data/ext/sources/examples/talk-llama/llama-vocab.h +45 -0
  56. data/ext/sources/examples/talk-llama/llama.cpp +65 -10
  57. data/ext/sources/examples/talk-llama/llama.h +95 -177
  58. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  59. data/ext/sources/examples/talk-llama/unicode.cpp +207 -0
  60. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  61. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  62. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  63. data/ext/sources/ggml/CMakeLists.txt +59 -31
  64. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  65. data/ext/sources/ggml/include/ggml-backend.h +17 -1
  66. data/ext/sources/ggml/include/ggml-cpu.h +1 -1
  67. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  68. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  69. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  70. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  71. data/ext/sources/ggml/include/ggml.h +221 -16
  72. data/ext/sources/ggml/src/CMakeLists.txt +17 -2
  73. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  74. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +30 -13
  76. data/ext/sources/ggml/src/ggml-backend.cpp +221 -38
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  79. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  83. data/ext/sources/ggml/src/ggml-cann/common.h +143 -1
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +488 -69
  85. data/ext/sources/ggml/src/ggml-common.h +17 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +40 -18
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
  90. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +103 -582
  91. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  92. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +265 -437
  93. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  94. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  95. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  96. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  97. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  98. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +32 -2
  99. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  100. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -6
  101. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +70 -42
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +35 -28
  103. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  104. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  105. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +227 -97
  106. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +474 -1116
  107. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1587 -1177
  108. data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -8
  109. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  110. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +458 -47
  112. data/ext/sources/ggml/src/ggml-cpu/repack.h +22 -0
  113. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +89 -60
  114. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  115. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  116. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  117. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  118. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  119. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  120. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +170 -26
  121. data/ext/sources/ggml/src/ggml-cpu/vec.h +506 -63
  122. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  123. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  124. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  125. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  126. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  127. data/ext/sources/ggml/src/ggml-cuda/common.cuh +250 -63
  128. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  129. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  130. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  131. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  132. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +15 -0
  133. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  134. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  135. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  136. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  137. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +498 -367
  138. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +137 -91
  139. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  140. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  141. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  142. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +86 -50
  143. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  144. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  145. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  146. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +379 -107
  147. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  148. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  149. data/ext/sources/ggml/src/ggml-cuda/mean.cu +56 -2
  150. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  151. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  152. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  153. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  154. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  155. data/ext/sources/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
  156. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
  157. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  158. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  159. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  160. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  161. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  162. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  163. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  164. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  165. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  166. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  167. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  168. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  169. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  170. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  171. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  172. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  173. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  174. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  176. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  177. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -100
  178. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  179. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  191. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  192. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  193. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  234. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  235. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  236. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  237. data/ext/sources/ggml/src/ggml-cuda/unary.cu +90 -0
  238. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +8 -0
  239. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  240. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  241. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  242. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  243. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  244. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +10 -2
  245. data/ext/sources/ggml/src/ggml-impl.h +119 -9
  246. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  247. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  248. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  249. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  250. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  251. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  252. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  253. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  254. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +136 -63
  255. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +2854 -1503
  259. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  260. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +18 -0
  261. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +2510 -242
  262. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  263. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  264. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  265. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  266. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  300. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  301. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  302. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  303. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +67 -47
  304. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  305. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +15 -5
  306. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  307. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +25 -16
  308. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  309. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +166 -99
  310. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -306
  311. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  312. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  313. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +1 -31
  314. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +79 -29
  315. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  316. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  317. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  318. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +328 -323
  319. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  320. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  321. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  322. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  323. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +74 -55
  324. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  325. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  326. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +35 -42
  327. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  328. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  329. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  330. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  331. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  332. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  333. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3492 -883
  334. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  335. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  336. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  337. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  338. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  339. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  340. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  341. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  342. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  343. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  344. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  345. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  346. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  347. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  348. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  349. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  350. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  351. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  352. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  353. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  354. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  355. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  356. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  357. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  358. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  359. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  360. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  361. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  362. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  363. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  364. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  365. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +4 -0
  366. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  367. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  368. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  369. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  370. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  371. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  372. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  373. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  374. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  375. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +55 -11
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -77
  401. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  402. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  403. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  404. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  405. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  406. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  407. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  408. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  409. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  410. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  411. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  412. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  413. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  414. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  415. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  416. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  417. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  418. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  419. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  420. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  421. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  422. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  423. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  424. data/ext/sources/ggml/src/ggml.c +478 -98
  425. data/ext/sources/ggml/src/gguf.cpp +8 -1
  426. data/ext/sources/src/whisper.cpp +23 -46
  427. data/ext/sources/tests/CMakeLists.txt +8 -1
  428. data/ext/sources/tests/test-vad-full.cpp +3 -3
  429. data/ext/sources/tests/test-vad.cpp +2 -2
  430. data/lib/whisper/model/uri.rb +1 -1
  431. data/sig/whisper.rbs +7 -0
  432. data/test/test_params.rb +8 -0
  433. data/test/test_whisper.rb +1 -1
  434. data/whispercpp.gemspec +1 -1
  435. metadata +164 -157
  436. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  437. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  438. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  439. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  440. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  441. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  442. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  443. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  444. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  445. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  446. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  447. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  448. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  449. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  450. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  451. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  452. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  453. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  454. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  455. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  456. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  457. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  458. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  459. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  460. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  461. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  462. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  463. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  464. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  465. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  466. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  467. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  468. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  469. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  470. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  471. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  472. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  473. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  474. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  475. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  476. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  477. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  478. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  479. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  480. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  481. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  482. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  483. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  484. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  485. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  486. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  487. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  488. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  489. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  490. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  491. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  492. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  493. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  494. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  495. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  496. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  497. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  498. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  499. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  500. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  501. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  502. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  503. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  504. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  505. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  506. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  507. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  508. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  509. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  510. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  511. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  512. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  513. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  514. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  515. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  516. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  517. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  518. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  519. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  520. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  521. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  522. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  523. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  524. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  525. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  526. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  527. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  548. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  549. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  550. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  551. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  552. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  553. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  554. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  555. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  556. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  557. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  558. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  559. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  560. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  561. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  562. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  563. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  564. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  565. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  566. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  567. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  568. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  569. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  570. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  571. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  572. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  573. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  574. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  575. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  576. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  577. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  578. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  579. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  580. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  581. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  582. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  583. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  584. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  585. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  586. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
data/ext/sources/examples/talk-llama/llama-vocab.h
@@ -6,6 +6,50 @@
  #include <vector>
  #include <memory>

+ // pre-tokenization types
+ enum llama_vocab_pre_type {
+     LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+     LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+     LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+     LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+     LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
+     LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
+     LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
+     LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+     LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
+     LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
+     LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
+     LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
+     LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
+     LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
+     LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
+     LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
+     LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
+     LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
+     LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
+     LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
+     LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
+     LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
+     LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
+     LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
+     LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
+     LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
+     LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
+     LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
+     LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
+     LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
+     LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
+     LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
+     LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
+     LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
+     LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
+     LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
+     LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
+     LLAMA_VOCAB_PRE_TYPE_KIMI_K2        = 37,
+     LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE  = 38,
+     LLAMA_VOCAB_PRE_TYPE_GROK_2         = 39,
+ };
+
  struct LLM_KV;
  struct llama_model_loader;

@@ -59,6 +103,7 @@ struct llama_vocab {
      llama_token token_sep() const;
      llama_token token_nl () const;
      llama_token token_pad() const;
+     llama_token token_mask() const;

      llama_token token_prefix() const;
      llama_token token_middle() const;
data/ext/sources/examples/talk-llama/llama.cpp
@@ -25,6 +25,18 @@
  // interface implementation
  //

+ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
+     switch (flash_attn_type) {
+         case LLAMA_FLASH_ATTN_TYPE_AUTO:
+             return "auto";
+         case LLAMA_FLASH_ATTN_TYPE_DISABLED:
+             return "disabled";
+         case LLAMA_FLASH_ATTN_TYPE_ENABLED:
+             return "enabled";
+     }
+     GGML_ABORT("fatal error");
+ }
+
  struct llama_sampler_chain_params llama_sampler_chain_default_params() {
      struct llama_sampler_chain_params result = {
          /*.no_perf =*/ true,
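Note: the hunk above introduces llama_flash_attn_type_name(), which maps the new llama_flash_attn_type enum to a printable string. A minimal, hypothetical caller sketch (not part of this diff; it assumes the updated llama.h is on the include path and uses the flash_attn_type context-params field added further below):

    #include "llama.h"
    #include <stdio.h>

    // Log the configured flash-attention mode of a context-params struct.
    static void log_flash_attn_mode(const struct llama_context_params * cparams) {
        // llama_flash_attn_type_name() returns "auto", "disabled" or "enabled"
        printf("flash_attn_type = %s\n", llama_flash_attn_type_name(cparams->flash_attn_type));
    }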
@@ -47,6 +59,7 @@ bool llama_supports_mlock(void) {

  bool llama_supports_gpu_offload(void) {
      return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+            ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
             llama_supports_rpc();
  }

@@ -71,7 +84,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
          GGML_ASSERT(dev && "CPU backend is not loaded");
          auto * reg = ggml_backend_dev_backend_reg(dev);
          auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
-         numa_init_fn(numa);
+         if (numa_init_fn) {
+             numa_init_fn(numa);
+         }
      }
  }

@@ -170,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
              model->devices.push_back(*dev);
          }
      } else {
+         // default device selection
+
+         // build list of available devices
+         std::vector<ggml_backend_dev_t> gpus;
+         std::vector<ggml_backend_dev_t> igpus;
          std::vector<ggml_backend_dev_t> rpc_servers;
-         // use all available devices
+
          for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
              ggml_backend_dev_t dev = ggml_backend_dev_get(i);
              switch (ggml_backend_dev_type(dev)) {
@@ -180,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
                      // skip CPU backends since they are handled separately
                      break;

-                 case GGML_BACKEND_DEVICE_TYPE_GPU:
+                 case GGML_BACKEND_DEVICE_TYPE_GPU: {
                      ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                      if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                          rpc_servers.push_back(dev);
                      } else {
-                         model->devices.push_back(dev);
+                         // check if there is already a GPU with the same device id
+                         ggml_backend_dev_props props;
+                         ggml_backend_dev_get_props(dev, &props);
+                         auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+                             ggml_backend_dev_props d_props;
+                             ggml_backend_dev_get_props(d, &d_props);
+                             if (props.device_id && d_props.device_id) {
+                                 return strcmp(props.device_id, d_props.device_id) == 0;
+                             }
+                             return false;
+                         });
+
+                         if (it != gpus.end()) {
+                             LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+                                 __func__,
+                                 ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                                 props.device_id ? props.device_id : "unknown id",
+                                 ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+                         } else {
+                             gpus.push_back(dev);
+                         }
                      }
                      break;
+                 }
+
+                 case GGML_BACKEND_DEVICE_TYPE_IGPU:
+                     igpus.push_back(dev);
+                     break;
              }
          }
-         // add RPC servers at the front of the list
-         if (!rpc_servers.empty()) {
-             model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+         // add RPC servers at the front of the list to minimize network transfers
+         model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+         // add GPUs
+         model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+         // add integrated GPUs only if no other devices were found
+         if (model->devices.empty()) {
+             model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
          }
      }

@@ -213,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
      }

      for (auto * dev : model->devices) {
-         size_t free, total; // NOLINT
-         ggml_backend_dev_memory(dev, &free, &total);
-         LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+         ggml_backend_dev_props props;
+         ggml_backend_dev_get_props(dev, &props);
+         LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+             ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+             props.device_id ? props.device_id : "unknown id",
+             props.memory_free/1024/1024);
      }

      const int status = llama_model_load(path_model, splits, *model, params);
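Note: together with the preceding hunks, default device selection now orders devices as RPC servers first (to minimize network transfers), then discrete GPUs deduplicated by device_id, with integrated GPUs used only as a fallback when nothing else is found. A hypothetical C++ sketch (not part of this diff) of enumerating devices through the ggml_backend_dev_get_props() API the new code relies on:

    #include "ggml-backend.h"
    #include <cstdio>

    // Print every registered backend device with its id and free memory.
    static void list_devices() {
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            ggml_backend_dev_props props;
            ggml_backend_dev_get_props(dev, &props);
            // device_id may be NULL for backends that do not report one
            std::printf("%s (%s) id=%s - %zu MiB free\n",
                        ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                        props.device_id ? props.device_id : "unknown id",
                        props.memory_free / 1024 / 1024);
        }
    }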
data/ext/sources/examples/talk-llama/llama.h
@@ -64,59 +64,18 @@ extern "C" {

      typedef struct llama_memory_i * llama_memory_t;

-     struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
-
      typedef int32_t llama_pos;
      typedef int32_t llama_token;
      typedef int32_t llama_seq_id;

      enum llama_vocab_type {
-         LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-         LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
-         LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
-         LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
-         LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
-         LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
-     };
-
-     // pre-tokenization types
-     enum llama_vocab_pre_type {
-         LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
-         LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
-         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
-         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-         LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
-         LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
-         LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
-         LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
-         LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
-         LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
-         LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
-         LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
-         LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
-         LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
-         LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
-         LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
-         LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
-         LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
-         LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
-         LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
-         LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
-         LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
-         LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
-         LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
-         LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
-         LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
-         LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
-         LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
-         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
-         LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
-         LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
-         LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
-         LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
-         LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
-         LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
-         LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
+         LLAMA_VOCAB_TYPE_NONE   = 0, // For models without vocab
+         LLAMA_VOCAB_TYPE_SPM    = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+         LLAMA_VOCAB_TYPE_BPE    = 2, // GPT-2 tokenizer based on byte-level BPE
+         LLAMA_VOCAB_TYPE_WPM    = 3, // BERT tokenizer based on WordPiece
+         LLAMA_VOCAB_TYPE_UGM    = 4, // T5 tokenizer based on Unigram
+         LLAMA_VOCAB_TYPE_RWKV   = 5, // RWKV tokenizer based on greedy tokenization
+         LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
      };

      enum llama_rope_type {
@@ -191,6 +150,7 @@ extern "C" {
          //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
          LLAMA_FTYPE_MOSTLY_TQ1_0     = 36, // except 1d tensors
          LLAMA_FTYPE_MOSTLY_TQ2_0     = 37, // except 1d tensors
+         LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors

          LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
      };
@@ -219,6 +179,14 @@ extern "C" {
         LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
     };

+    enum llama_flash_attn_type {
+        LLAMA_FLASH_ATTN_TYPE_AUTO     = -1,
+        LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
+        LLAMA_FLASH_ATTN_TYPE_ENABLED  = 1,
+    };
+
+    LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
+
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
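This tri-state enum replaces the boolean `flash_attn` context flag removed further down, with `AUTO` deferring the decision to the backend. A sketch of the intended use, assuming a loaded `llama_model *`; `make_context` is an illustrative name:

```cpp
#include "llama.h"
#include <cstdio>

// Sketch: let the backend decide whether Flash Attention is supported,
// instead of the old boolean flash_attn flag.
llama_context * make_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;

    printf("flash attention: %s\n", llama_flash_attn_type_name(cparams.flash_attn_type));
    return llama_init_from_model(model, cparams);
}
```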
@@ -238,7 +206,7 @@ extern "C" {
         llama_token_data * data;
         size_t size;
         int64_t selected; // this is the index in the data array (i.e. not the token id)
-        bool sorted;
+        bool sorted;      // note: do not assume the data is sorted - always check this flag
     } llama_token_data_array;

     typedef bool (*llama_progress_callback)(float progress, void * user_data);
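The sharpened comment on `sorted` is a contract for custom samplers: ordering may or may not hold. A sketch of the defensive pattern it asks for; `best_token` is an illustrative helper:

```cpp
#include "llama.h"

// Sketch: only rely on ordering when `sorted` is set,
// otherwise scan for the best candidate.
static llama_token best_token(const llama_token_data_array * arr) {
    if (arr->sorted) {
        return arr->data[0].id; // highest logit comes first when sorted
    }
    size_t best = 0;
    for (size_t i = 1; i < arr->size; ++i) {
        if (arr->data[i].logit > arr->data[best].logit) {
            best = i;
        }
    }
    return arr->data[best].id;
}
```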
@@ -323,10 +291,11 @@ extern "C" {
         const struct llama_model_kv_override * kv_overrides;

         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;    // only load the vocabulary, no weights
-        bool use_mmap;      // use mmap if possible
-        bool use_mlock;     // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
+        bool vocab_only;      // only load the vocabulary, no weights
+        bool use_mmap;        // use mmap if possible
+        bool use_mlock;       // force system to keep model in RAM
+        bool check_tensors;   // validate model tensor data
+        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
     };

     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
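`use_extra_bufts` joins the packed boolean block in `llama_model_params`. A sketch that sets the flags explicitly (the values shown mirror the comments above, not necessarily non-default behavior); `load_model` is an illustrative name:

```cpp
#include "llama.h"

// Sketch: load a model with the weight-repacking buffer types allowed.
llama_model * load_model(const char * path) {
    llama_model_params mparams = llama_model_default_params();
    mparams.use_mmap        = true;  // map the file instead of reading it
    mparams.check_tensors   = false; // skip tensor-data validation
    mparams.use_extra_bufts = true;  // extra buffer types for weight repacking
    return llama_model_load_from_file(path, mparams);
}
```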
@@ -342,6 +311,7 @@ extern "C" {
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
         enum llama_attention_type    attention_type;    // attention type to use for embeddings
+        enum llama_flash_attn_type   flash_attn_type;   // when to enable Flash Attention

         // ref: https://github.com/ggml-org/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -351,7 +321,7 @@ extern "C" {
         float yarn_beta_fast;   // YaRN low correction dim
         float yarn_beta_slow;   // YaRN high correction dim
         uint32_t yarn_orig_ctx; // YaRN original context size
-        float defrag_thold;     // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+        float defrag_thold;     // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)

         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
@@ -368,12 +338,14 @@ extern "C" {
         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // use flash attention [EXPERIMENTAL]
         bool no_perf;     // measure performance timings
         bool op_offload;  // offload host tensor operations to device
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
                           // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
                           // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+        bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
+                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
     };

     // model quantization parameters
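A sketch of the configuration the `kv_unified` comment suggests for parallel decoding over unrelated sequences; `make_parallel_context` is an illustrative name:

```cpp
#include "llama.h"

// Sketch: a context tuned for several independent streams. Per the new
// comment, a split (non-unified) KV buffer can be faster when the
// sequences do not share a large prefix.
llama_context * make_parallel_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_seq_max       = 4;     // four independent sequences
    cparams.kv_unified      = false; // separate KV buffer per sequence
    cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
    return llama_init_from_model(model, cparams);
}
```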
@@ -503,8 +475,6 @@ extern "C" {
     LLAMA_API llama_memory_t llama_get_memory(const struct llama_context * ctx);
     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type

-    DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
-
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);

@@ -573,6 +543,9 @@ extern "C" {
     // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
     LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);

+    // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+    LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,
@@ -588,10 +561,32 @@ extern "C" {
             struct llama_model * model,
             const char * path_lora);

+    // Functions to access the adapter's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - When retrieving a string, an extra byte must be allocated to account for the null terminator
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
+
+    // Get metadata key name by index
+    LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);

+    // Get the invocation tokens if the current lora is an alora
+    LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
+    LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter);
+
     // The following functions operate on a llama_context, hence the naming: llama_verb_...

     // Add a loaded LoRA adapter to given context
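A sketch that exercises the documented contract of the new adapter-metadata accessors (fixed-size buffers, -1 on failure, always null-terminated); `dump_adapter_meta` is an illustrative helper:

```cpp
#include "llama.h"
#include <cstdio>

// Sketch: dump all scalar metadata of a loaded LoRA adapter.
// Buffer sizes are arbitrary; each call null-terminates its output.
static void dump_adapter_meta(const llama_adapter_lora * adapter) {
    const int32_t n = llama_adapter_meta_count(adapter);
    for (int32_t i = 0; i < n; ++i) {
        char key[256];
        char val[256];
        if (llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key)) >= 0 &&
            llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) >= 0) {
            printf("%s = %s\n", key, val);
        }
    }
}
```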
@@ -698,111 +693,6 @@ extern "C" {
     // Check if the memory supports shifting
     LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);

-    //
-    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
-    //
-
-    // Returns the number of tokens in the KV cache (slow, use only for debug)
-    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-        "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-        "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    DEPRECATED(LLAMA_API void llama_kv_self_clear(
-            struct llama_context * ctx),
-        "Use llama_memory_clear() instead");
-
-    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-    //   seq_id < 0 : match any sequence
-    //   p0 < 0     : [0,  p1]
-    //   p1 < 0     : [p0, inf)
-    DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1),
-        "Use llama_memory_seq_rm() instead");
-
-    // Copy all tokens that belong to the specified sequence to another sequence
-    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-    //   p0 < 0 : [0,  p1]
-    //   p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
-            struct llama_context * ctx,
-            llama_seq_id seq_id_src,
-            llama_seq_id seq_id_dst,
-            llama_pos p0,
-            llama_pos p1),
-        "Use llama_memory_seq_cp() instead");
-
-    // Removes all tokens that do not belong to the specified sequence
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "Use llama_memory_seq_keep() instead");
-
-    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    //   p0 < 0 : [0,  p1]
-    //   p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            llama_pos delta),
-        "Use llama_memory_seq_add() instead");
-
-    // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    //   p0 < 0 : [0,  p1]
-    //   p1 < 0 : [p0, inf)
-    DEPRECATED(void llama_kv_self_seq_div(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            int d),
-        "Use llama_memory_seq_div() instead");
-
-    // Returns the smallest position present in the KV cache for the specified sequence
-    // This is typically non-zero only for SWA caches
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "Use llama_memory_seq_pos_min() instead");
-
-    // Returns the largest position present in the KV cache for the specified sequence
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "Use llama_memory_seq_pos_max() instead");
-
-    // Defragment the KV cache
-    // This will be applied:
-    //   - lazily on next llama_decode()
-    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
-        "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
-
-    // Check if the context supports KV cache shifting
-    DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
-        "use llama_memory_can_shift() instead");
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
-        "simply remove this call, updates are applied lazily on the next llama_decode()");
-
     //
     // State / sessions
     //
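Every removed `llama_kv_self_*` call has a `llama_memory_*` counterpart operating on the handle returned by `llama_get_memory()`. A migration sketch; `reset_sequence` is an illustrative helper:

```cpp
#include "llama.h"

// Sketch: all KV-cache manipulation now goes through llama_memory_t.
static void reset_sequence(llama_context * ctx, llama_seq_id seq_id) {
    llama_memory_t mem = llama_get_memory(ctx);

    // was: llama_kv_self_seq_rm(ctx, seq_id, -1, -1);
    llama_memory_seq_rm(mem, seq_id, -1, -1);

    // was: llama_kv_self_clear(ctx); the extra flag also zeroes the KV data
    llama_memory_clear(mem, /*data =*/ true);
}
```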
@@ -901,6 +791,29 @@ extern "C" {
             size_t n_token_capacity,
             size_t * n_token_count_out);

+    #define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+    typedef uint32_t llama_state_seq_flags;
+
+    LLAMA_API size_t llama_state_seq_get_size_ext(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_get_data_ext(
+            struct llama_context * ctx,
+            uint8_t * dst,
+            size_t size,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_set_data_ext(
+            struct llama_context * ctx,
+            const uint8_t * src,
+            size_t size,
+            llama_seq_id dest_seq_id,
+            llama_state_seq_flags flags);
+
     //
     // Decoding
     //
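A sketch of snapshotting one sequence with the new `_ext` functions; passing `flags = 0` is assumed to match the behavior of the non-`_ext` variants, and `snapshot_seq` is an illustrative helper:

```cpp
#include "llama.h"
#include <vector>

// Sketch: serialize one sequence's state into a byte buffer.
// LLAMA_STATE_SEQ_FLAGS_SWA_ONLY would restrict the snapshot to the
// SWA portion of the cache.
static std::vector<uint8_t> snapshot_seq(llama_context * ctx, llama_seq_id seq_id) {
    const llama_state_seq_flags flags = 0;

    std::vector<uint8_t> buf(llama_state_seq_get_size_ext(ctx, seq_id, flags));
    const size_t written = llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, flags);
    buf.resize(written);
    return buf;
}
```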
@@ -992,6 +905,7 @@ extern "C" {
     // in the order they have appeared in the batch.
     // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
+    // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);

     // Logits for the ith token. For positive indices, Equivalent to:
@@ -1006,6 +920,7 @@ extern "C" {
     // in the order they have appeared in the batch.
     // shape: [n_outputs*n_embd]
     // Otherwise, returns NULL.
+    // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Get the embeddings for the ith token. For positive indices, Equivalent to:
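A sketch of the migration both TODOs point at; `last_logits` is an illustrative helper:

```cpp
#include "llama.h"

// Sketch: index into the outputs with the _ith accessors instead of
// walking the raw pointer. -1 addresses the last output of the most
// recent llama_decode().
static float * last_logits(llama_context * ctx) {
    return llama_get_logits_ith(ctx, -1); // instead of llama_get_logits(ctx)
}
```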
@@ -1044,6 +959,7 @@ extern "C" {
     LLAMA_API llama_token llama_vocab_sep (const struct llama_vocab * vocab); // sentence separator
     LLAMA_API llama_token llama_vocab_nl  (const struct llama_vocab * vocab); // next-line
     LLAMA_API llama_token llama_vocab_pad (const struct llama_vocab * vocab); // padding
+    LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask

     LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
     LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
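`llama_vocab_mask` pairs naturally with the diffusion models mentioned above, which fill the canvas with mask tokens before iterative denoising. A sketch assuming the getter returns `LLAMA_TOKEN_NULL` when the vocab defines no mask token, as the other special-token getters do:

```cpp
#include "llama.h"
#include <cstdio>

// Sketch: check for a mask token before running a diffusion-style loop.
static void print_mask_token(const llama_vocab * vocab) {
    const llama_token mask = llama_vocab_mask(vocab);
    if (mask == LLAMA_TOKEN_NULL) {
        printf("vocab has no mask token\n");
    } else {
        printf("mask token id: %d\n", mask);
    }
}
```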
@@ -1244,11 +1160,6 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
     LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);

-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
-        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
-
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
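With `llama_sampler_init_softmax` gone, the usual replacement is to chain a filtering sampler with a final dist sampler, which computes the probabilities it needs itself. A minimal chain sketch; the top-k value is arbitrary:

```cpp
#include "llama.h"

// Sketch: top-k filtering followed by seeded sampling, no explicit softmax.
static llama_sampler * make_sampler() {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    return chain;
}
```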
@@ -1418,23 +1329,25 @@ extern "C" {
     //
     // Performance utils
     //
-    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
     //

     struct llama_perf_context_data {
-        double t_start_ms;
-        double t_load_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_p_eval;
-        int32_t n_eval;
+        // ms == milliseconds
+        double t_start_ms;  // absolute start time
+        double t_load_ms;   // time needed for loading the model
+        double t_p_eval_ms; // time needed for processing the prompt
+        double t_eval_ms;   // time needed for generating tokens
+
+        int32_t n_p_eval;   // number of prompt tokens
+        int32_t n_eval;     // number of generated tokens
+        int32_t n_reused;   // number of times a ggml compute graph had been reused
     };

     struct llama_perf_sampler_data {
-        double t_sample_ms;
+        double t_sample_ms; // time needed for sampling in ms

-        int32_t n_sample;
+        int32_t n_sample;   // number of sampled tokens
     };

     LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
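A sketch that turns the newly documented counters into a throughput figure (examples/tools only, per the NOTE above); `print_speed` is an illustrative helper:

```cpp
#include "llama.h"
#include <cstdio>

// Sketch: compute decode speed from the perf counters.
static void print_speed(const llama_context * ctx) {
    const llama_perf_context_data d = llama_perf_context(ctx);
    if (d.t_eval_ms > 0.0) {
        printf("decode: %.2f tok/s (graphs reused: %d)\n",
               1e3 * d.n_eval / d.t_eval_ms, d.n_reused);
    }
}
```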
@@ -1446,6 +1359,9 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);

+    // print a breakdown of per-device memory use via LLAMA_LOG:
+    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+
     //
     // training
     //
@@ -1464,6 +1380,8 @@ extern "C" {

         ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
         void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+
+        enum ggml_opt_optimizer_type optimizer_type;
     };

     LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
@@ -66,7 +66,7 @@ struct whisper_params {
     float top_p = 0.80f;
     float min_p = 0.01f;
    float temp  = 0.30f;
-
+
     float vad_thold  = 0.6f;
     float freq_thold = 100.0f;

@@ -76,7 +76,7 @@ struct whisper_params {
     bool no_timestamps  = true;
     bool verbose_prompt = false;
     bool use_gpu        = true;
-    bool flash_attn     = false;
+    bool flash_attn     = true;

     std::string person   = "Georgi";
     std::string bot_name = "LLaMA";
@@ -122,6 +122,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
     else if (arg == "-vp"  || arg == "--verbose-prompt") { params.verbose_prompt = true; }
     else if (arg == "-ng"  || arg == "--no-gpu")         { params.use_gpu        = false; }
     else if (arg == "-fa"  || arg == "--flash-attn")     { params.flash_attn     = true; }
+    else if (arg == "-nfa" || arg == "--no-flash-attn")  { params.flash_attn     = false; }
     else if (arg == "-p"   || arg == "--person")         { params.person         = argv[++i]; }
     else if (arg == "-bn"  || arg == "--bot-name")       { params.bot_name       = argv[++i]; }
     else if (arg == "--session")                         { params.path_session   = argv[++i]; }
@@ -175,7 +176,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -pe,      --print-energy   [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
     fprintf(stderr, "  -vp,      --verbose-prompt [%-7s] print prompt at start\n",              params.verbose_prompt ? "true" : "false");
     fprintf(stderr, "  -ng,      --no-gpu         [%-7s] disable GPU\n",                        params.use_gpu ? "false" : "true");
-    fprintf(stderr, "  -fa,      --flash-attn     [%-7s] flash attention\n",                    params.flash_attn ? "true" : "false");
+    fprintf(stderr, "  -fa,      --flash-attn     [%-7s] enable flash attention\n",             params.flash_attn ? "true" : "false");
+    fprintf(stderr, "  -nfa,     --no-flash-attn  [%-7s] disable flash attention\n",            params.flash_attn ? "false" : "true");
     fprintf(stderr, "  -p NAME,  --person NAME    [%-7s] person name (for prompt selection)\n", params.person.c_str());
     fprintf(stderr, "  -bn NAME, --bot-name NAME  [%-7s] bot name (to display)\n",              params.bot_name.c_str());
     fprintf(stderr, "  -w TEXT,  --wake-command T [%-7s] wake-up command to listen for\n",      params.wake_cmd.c_str());
@@ -340,9 +342,10 @@ int main(int argc, char ** argv) {
     llama_context_params lcparams = llama_context_default_params();

     // tune these to your liking
-    lcparams.n_ctx      = 2048;
-    lcparams.n_threads  = params.n_threads;
-    lcparams.flash_attn = params.flash_attn;
+    lcparams.n_ctx     = 2048;
+    lcparams.n_threads = params.n_threads;
+
+    lcparams.flash_attn_type = params.flash_attn ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED;

     struct llama_context * ctx_llama = llama_init_from_model(model_llama, lcparams);