whispercpp 1.3.6 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (828)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/README.md +38 -5
  5. data/Rakefile +18 -3
  6. data/ext/dependencies.rb +10 -4
  7. data/ext/dependencies_for_windows.rb +17 -0
  8. data/ext/extconf.rb +20 -8
  9. data/ext/options.rb +54 -14
  10. data/ext/options_for_windows.rb +51 -0
  11. data/ext/ruby_whisper.c +36 -42
  12. data/ext/ruby_whisper.h +135 -0
  13. data/ext/ruby_whisper_context.c +107 -28
  14. data/ext/ruby_whisper_log_queue.c +180 -0
  15. data/ext/ruby_whisper_log_settable.h +47 -0
  16. data/ext/ruby_whisper_parakeet.c +49 -0
  17. data/ext/ruby_whisper_parakeet_context.c +304 -0
  18. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  19. data/ext/ruby_whisper_parakeet_model.c +84 -0
  20. data/ext/ruby_whisper_parakeet_params.c +548 -0
  21. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  22. data/ext/ruby_whisper_parakeet_token.c +188 -0
  23. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  24. data/ext/ruby_whisper_params.c +256 -65
  25. data/ext/ruby_whisper_segment.c +6 -6
  26. data/ext/ruby_whisper_transcribe.cpp +42 -15
  27. data/ext/sources/CMakeLists.txt +41 -3
  28. data/ext/sources/CMakePresets.json +95 -0
  29. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  30. data/ext/sources/cmake/parakeet.pc.in +10 -0
  31. data/ext/sources/cmake/whisper.pc.in +1 -1
  32. data/ext/sources/examples/CMakeLists.txt +4 -2
  33. data/ext/sources/examples/bench/bench.cpp +1 -1
  34. data/ext/sources/examples/cli/cli.cpp +43 -9
  35. data/ext/sources/examples/common-ggml.cpp +2 -0
  36. data/ext/sources/examples/common-whisper.cpp +139 -67
  37. data/ext/sources/examples/common-whisper.h +11 -0
  38. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  39. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  40. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  41. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  42. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  43. data/ext/sources/examples/server/server.cpp +199 -163
  44. data/ext/sources/ggml/CMakeLists.txt +21 -13
  45. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  46. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  47. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  48. data/ext/sources/ggml/include/ggml-backend.h +72 -10
  49. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  50. data/ext/sources/ggml/include/ggml-rpc.h +3 -3
  51. data/ext/sources/ggml/include/ggml.h +101 -9
  52. data/ext/sources/ggml/include/gguf.h +10 -2
  53. data/ext/sources/ggml/src/CMakeLists.txt +22 -5
  54. data/ext/sources/ggml/src/ggml-alloc.c +5 -1
  55. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  56. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  57. data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
  58. data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
  59. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
  60. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
  61. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
  62. data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
  63. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
  64. data/ext/sources/ggml/src/ggml-common.h +11 -0
  65. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
  66. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
  67. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
  68. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
  69. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
  70. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  71. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  72. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
  73. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
  74. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
  75. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  76. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
  77. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
  78. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
  79. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  80. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
  81. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
  82. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  83. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
  84. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
  85. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
  86. data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
  87. data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
  88. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  89. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
  90. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
  91. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
  92. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  93. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  94. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  95. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  96. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  97. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  98. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  99. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  100. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  101. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  102. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  103. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  104. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  105. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  106. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  107. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
  108. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  109. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
  110. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  111. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  112. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
  113. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
  114. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  115. data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
  116. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  117. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  118. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  119. data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
  120. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  121. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
  122. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  123. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
  124. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
  125. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
  129. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
  130. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  131. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  132. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  133. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
  134. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  135. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
  136. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  137. data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
  138. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
  139. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
  140. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  141. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
  142. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
  143. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
  144. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
  145. data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
  146. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  147. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
  148. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  149. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
  150. data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
  151. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  152. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  153. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  154. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  155. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  156. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
  157. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  158. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  159. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  160. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  161. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
  162. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  163. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  164. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  165. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
  166. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  167. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  168. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  169. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  170. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  171. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  172. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  173. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  174. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  176. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  177. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  178. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  179. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  191. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
  192. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
  193. data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
  194. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  195. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
  196. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  197. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
  198. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  199. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
  200. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
  201. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
  202. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
  203. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
  204. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
  205. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  206. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  207. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
  208. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  209. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  210. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  211. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
  212. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  213. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
  214. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
  215. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
  216. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
  217. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
  218. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  219. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  220. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  221. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  222. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  223. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  224. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  225. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  226. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
  227. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
  228. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  229. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
  230. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
  231. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
  232. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
  233. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  235. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
  254. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
  255. data/ext/sources/ggml/src/ggml-impl.h +6 -1
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
  259. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
  260. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
  261. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
  262. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
  263. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
  264. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  265. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
  266. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
  322. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
  323. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
  324. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
  325. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
  326. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
  327. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  328. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
  329. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
  330. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  331. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
  332. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
  333. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
  334. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
  335. data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
  336. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  337. data/ext/sources/ggml/src/ggml-quants.c +289 -114
  338. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  339. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  340. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  341. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  342. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  343. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
  344. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
  345. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
  346. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  347. data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
  348. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
  349. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
  350. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  351. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  352. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  353. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  354. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  355. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  356. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
  357. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
  358. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  359. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  360. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
  361. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
  362. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
  363. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
  364. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
  365. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  366. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  367. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
  368. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
  369. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  370. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  371. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
  372. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  373. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  374. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  375. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  376. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  377. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
  378. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  379. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  380. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  381. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  382. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  383. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  384. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  385. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  386. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  387. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
  388. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
  389. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
  390. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
  391. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
  392. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
  393. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
  394. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
  395. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
  396. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
  397. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
  398. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
  399. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
  400. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
  401. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
  402. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
  403. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
  404. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
  405. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
  406. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
  407. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
  408. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
  409. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
  410. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
  411. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
  412. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
  413. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
  414. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
  415. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
  416. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
  417. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
  418. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
  420. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
  421. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
  422. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
  423. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  424. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  425. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  426. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
  427. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
  428. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
  429. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
  430. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
  431. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
  432. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
  433. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
  434. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  449. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
  450. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
  451. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
  452. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
  453. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  454. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  455. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  456. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
  457. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  458. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  459. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  460. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  461. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  462. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  463. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  464. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  465. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  466. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  467. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  468. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  469. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  470. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
  471. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
  472. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  473. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  474. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  475. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  476. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
  477. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  478. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
  479. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  480. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
  481. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  482. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  483. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
  484. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  485. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
  486. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
  487. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  488. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  489. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
  490. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
  491. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
  492. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  493. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
  494. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
  495. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  496. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  497. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  498. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  499. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  500. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  501. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
  502. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  503. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  504. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
  505. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  506. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  507. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  508. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
  509. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
  510. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
  511. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  512. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  513. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  514. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  515. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  516. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  517. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  518. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
  519. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  520. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
  521. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  522. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  523. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  524. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  525. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  526. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
  527. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  528. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
  529. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
  530. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
  531. data/ext/sources/ggml/src/ggml.c +110 -28
  532. data/ext/sources/ggml/src/gguf.cpp +173 -28
  533. data/ext/sources/include/parakeet.h +342 -0
  534. data/ext/sources/include/whisper.h +10 -0
  535. data/ext/sources/media/matmul.png +0 -0
  536. data/ext/sources/src/CMakeLists.txt +23 -0
  537. data/ext/sources/src/parakeet-arch.h +188 -0
  538. data/ext/sources/src/parakeet.cpp +3838 -0
  539. data/ext/sources/src/whisper.cpp +56 -12
  540. data/extsources.rb +26 -10
  541. data/lib/whisper/log_settable.rb +36 -0
  542. data/lib/whisper/model/uri.rb +13 -1
  543. data/lib/whisper/output.rb +74 -0
  544. data/sig/whisper.rbs +411 -62
  545. data/test/helper.rb +2 -0
  546. data/test/jfk_reader/jfk_reader.c +50 -7
  547. data/test/test_callback.rb +1 -0
  548. data/test/test_package.rb +6 -5
  549. data/test/test_parakeet.rb +28 -0
  550. data/test/test_parakeet_callback.rb +107 -0
  551. data/test/test_parakeet_context.rb +116 -0
  552. data/test/test_parakeet_context_params.rb +24 -0
  553. data/test/test_parakeet_model.rb +21 -0
  554. data/test/test_parakeet_params.rb +78 -0
  555. data/test/test_parakeet_segment.rb +42 -0
  556. data/test/test_parakeet_token.rb +73 -0
  557. data/test/test_params.rb +2 -0
  558. data/test/test_vad_segment.rb +1 -1
  559. data/test/test_whisper.rb +24 -6
  560. data/whispercpp.gemspec +2 -2
  561. metadata +215 -281
  562. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  563. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  564. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  565. data/ext/sources/bindings/javascript/package.json +0 -26
  566. data/ext/sources/bindings/javascript/whisper.js +0 -19
  567. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  568. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  569. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  570. data/ext/sources/examples/addon.node/index.js +0 -59
  571. data/ext/sources/examples/addon.node/package.json +0 -16
  572. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  573. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  574. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  575. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  576. data/ext/sources/examples/coi-serviceworker.js +0 -146
  577. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  578. data/ext/sources/examples/command/command.cpp +0 -802
  579. data/ext/sources/examples/command/commands.txt +0 -9
  580. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  581. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  582. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  583. data/ext/sources/examples/generate-karaoke.sh +0 -57
  584. data/ext/sources/examples/helpers.js +0 -191
  585. data/ext/sources/examples/livestream.sh +0 -112
  586. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  587. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  588. data/ext/sources/examples/lsp/whisper.vim +0 -362
  589. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  590. data/ext/sources/examples/python/whisper_processor.py +0 -54
  591. data/ext/sources/examples/server/bench.js +0 -29
  592. data/ext/sources/examples/server.py +0 -120
  593. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  594. data/ext/sources/examples/stream/stream.cpp +0 -437
  595. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  596. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  597. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  598. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  599. data/ext/sources/examples/sycl/build.sh +0 -22
  600. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  601. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  602. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
  603. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  604. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
  605. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
  606. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
  607. data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
  608. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
  609. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  610. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
  611. data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
  612. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
  613. data/ext/sources/examples/talk-llama/llama-context.h +0 -359
  614. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  615. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
  616. data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
  617. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  618. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  619. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
  620. data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
  621. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
  622. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
  623. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  624. data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
  625. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  626. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  627. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
  628. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  629. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
  630. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
  631. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  632. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
  633. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
  634. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  635. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  636. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
  637. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  638. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  639. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  640. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
  641. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  642. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
  643. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
  644. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
  645. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
  646. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
  647. data/ext/sources/examples/talk-llama/llama-model.h +0 -597
  648. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
  649. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  650. data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
  651. data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
  652. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
  653. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
  654. data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
  655. data/ext/sources/examples/talk-llama/llama.h +0 -1573
  656. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
  657. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  658. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  659. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
  660. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  661. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
  662. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
  663. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
  664. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
  665. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
  666. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  667. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  668. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  669. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  670. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  671. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  672. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  673. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
  674. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  675. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
  676. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
  677. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
  678. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
  679. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  680. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
  681. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  682. data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
  683. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
  684. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  685. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  686. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
  687. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  688. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  689. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  690. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  691. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  692. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  693. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  694. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
  695. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  696. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  697. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
  698. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
  699. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  700. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
  701. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  702. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
  703. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  704. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  705. data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
  706. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  707. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
  708. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
  709. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  710. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  711. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  712. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
  713. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  714. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
  715. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
  716. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
  717. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
  718. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
  719. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  720. data/ext/sources/examples/talk-llama/models/models.h +0 -704
  721. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
  722. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  723. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
  724. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  725. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  726. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  727. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  728. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  729. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  730. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  731. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  732. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
  733. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  734. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  735. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  736. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  737. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
  738. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  739. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
  740. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  741. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  742. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  743. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  744. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
  745. data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
  746. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
  747. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
  748. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
  749. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
  750. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
  751. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  752. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  753. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
  754. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  755. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  756. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
  757. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  758. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  759. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  760. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  761. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  762. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  763. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  764. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
  765. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  766. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  767. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  768. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  769. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  770. data/ext/sources/examples/talk-llama/speak +0 -40
  771. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  772. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  773. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  774. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  775. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  776. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
  777. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  778. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  779. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  780. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  781. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  782. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  783. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  784. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  785. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  786. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  787. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  788. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  789. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  790. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  791. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
  792. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  793. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  794. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
  795. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
  796. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  798. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
  799. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  800. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
  801. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  802. data/ext/sources/tests/CMakeLists.txt +0 -112
  803. data/ext/sources/tests/earnings21/eval.mk +0 -58
  804. data/ext/sources/tests/earnings21/eval.py +0 -68
  805. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  806. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  807. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  808. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  809. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  810. data/ext/sources/tests/en-0-ref.txt +0 -1
  811. data/ext/sources/tests/en-1-ref.txt +0 -1
  812. data/ext/sources/tests/en-2-ref.txt +0 -1
  813. data/ext/sources/tests/es-0-ref.txt +0 -1
  814. data/ext/sources/tests/librispeech/eval.mk +0 -39
  815. data/ext/sources/tests/librispeech/eval.py +0 -47
  816. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  817. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  818. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  819. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  820. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  821. data/ext/sources/tests/run-tests.sh +0 -130
  822. data/ext/sources/tests/test-c.c +0 -3
  823. data/ext/sources/tests/test-vad-full.cpp +0 -56
  824. data/ext/sources/tests/test-vad.cpp +0 -83
  825. data/ext/sources/tests/test-whisper.js +0 -58
  826. data/lib/whisper/context.rb +0 -15
  827. data/lib/whisper/segment.rb +0 -58
  828. /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
@@ -1,1194 +0,0 @@
1
- #include "llama.h"
2
-
3
- #include "ggml-cpp.h"
4
- #include "llama-impl.h"
5
-
6
- #include "llama-chat.h"
7
- #include "llama-context.h"
8
- #include "llama-mmap.h"
9
- #include "llama-vocab.h"
10
- #include "llama-model-loader.h"
11
- #include "llama-model-saver.h"
12
- #include "llama-model.h"
13
-
14
- #include "ggml.h"
15
- #include "ggml-backend.h"
16
- #include "gguf.h"
17
-
18
- #include <algorithm>
19
- #include <cassert>
20
- #include <cinttypes>
21
- #include <cstddef>
22
- #include <cstdint>
23
- #include <cstdio>
24
- #include <cstring>
25
- #include <ctime>
26
- #include <stdexcept>
27
-
28
- #if defined(_MSC_VER)
29
- #pragma warning(disable: 4244 4267) // possible loss of data
30
- #endif
31
-
32
- //
33
- // interface implementation
34
- //
35
-
36
- const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
37
- switch (flash_attn_type) {
38
- case LLAMA_FLASH_ATTN_TYPE_AUTO:
39
- return "auto";
40
- case LLAMA_FLASH_ATTN_TYPE_DISABLED:
41
- return "disabled";
42
- case LLAMA_FLASH_ATTN_TYPE_ENABLED:
43
- return "enabled";
44
- }
45
- GGML_ABORT("fatal error");
46
- }
47
-
48
- struct llama_device_memory_data {
49
- int64_t total;
50
- int64_t free;
51
- llama_memory_breakdown_data mb;
52
- };
53
-
54
- static std::vector<llama_device_memory_data> llama_get_device_memory_data(
55
- const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
56
- std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
57
- const ggml_log_level log_level) {
58
- struct user_data_t {
59
- struct {
60
- ggml_log_callback callback;
61
- void * user_data;
62
- } original_logger;
63
- ggml_log_level min_level; // prints below this log level go to debug log
64
- };
65
- user_data_t ud;
66
- llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
67
- ud.min_level = log_level;
68
-
69
- llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
70
- const user_data_t * ud = (const user_data_t *) user_data;
71
- const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
72
- ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
73
- }, &ud);
74
-
75
- llama_model_params mparams_copy = *mparams;
76
- mparams_copy.no_alloc = true;
77
- mparams_copy.use_mmap = false;
78
- mparams_copy.use_mlock = false;
79
-
80
- llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
81
- if (model == nullptr) {
82
- llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
83
- throw std::runtime_error("failed to load model");
84
- }
85
-
86
- llama_context * ctx = llama_init_from_model(model, *cparams);
87
- if (ctx == nullptr) {
88
- llama_model_free(model);
89
- llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
90
- throw std::runtime_error("failed to create llama_context from model");
91
- }
92
-
93
- std::vector<llama_device_memory_data> ret(model->devices.size());
94
-
95
- std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
96
-
97
- for (const auto & [buft, mb] : memory_breakdown) {
98
- if (ggml_backend_buft_is_host(buft)) {
99
- continue;
100
- }
101
-
102
- ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
103
- if (!dev) {
104
- continue;
105
- }
106
- for (size_t i = 0; i < ret.size(); i++) {
107
- if (model->devices[i] == dev) {
108
- ret[i].mb.model += mb.model;
109
- ret[i].mb.context += mb.context;
110
- ret[i].mb.compute += mb.compute;
111
- break;
112
- }
113
- }
114
- }
115
- for (size_t i = 0; i < ret.size(); i++) {
116
- size_t free;
117
- size_t total;
118
- ggml_backend_dev_memory(model->devices[i], &free, &total);
119
-
120
- // devices can return 0 bytes for free and total memory if they do not
121
- // have any to report. in this case, we will use the host memory as a fallback
122
- // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
123
- if (free == 0 && total == 0) {
124
- ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
125
- if (cpu_dev == nullptr) {
126
- throw std::runtime_error(format("%s: no CPU backend found", __func__));
127
- }
128
- ggml_backend_dev_memory(cpu_dev, &free, &total);
129
- }
130
- ret[i].free = free;
131
- ret[i].total = total;
132
- }
133
-
134
- devs = model->devices;
135
- hp_ngl = model->hparams.n_layer;
136
- hp_n_ctx_train = model->hparams.n_ctx_train;
137
- hp_n_expert = model->hparams.n_expert;
138
-
139
- llama_memory_breakdown_print(ctx); // goes to debug log
140
-
141
- llama_free(ctx);
142
- llama_model_free(model);
143
- llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
144
- return ret;
145
- }
146
-
147
// Identifies which part of a layer is kept together when the layer's tensors are distributed.
// Each value includes everything covered by the smaller values.
// NOTE: only llama_params_fit_impl uses this enum; it is declared at file scope because
//       declaring it inside that function caused a Windows compilation issue.
enum layer_fraction_t {
    // nothing
    LAYER_FRACTION_NONE = 0,
    // attention
    LAYER_FRACTION_ATTN = 1,
    // attention + up
    LAYER_FRACTION_UP   = 2,
    // attention + up + gate
    LAYER_FRACTION_GATE = 3,
    // everything but sparse MoE weights
    LAYER_FRACTION_MOE  = 4,
};
156
-
157
// Exception type thrown when parameter fitting has to be aborted.
// It derives from std::runtime_error, so it can also be caught as one; catch this type first
// if the two cases need to be told apart.
class llama_params_fit_exception : public std::runtime_error {
public:
    // same constructor set that std::runtime_error offers
    explicit llama_params_fit_exception(const std::string & what_arg) : std::runtime_error(what_arg) {}
    explicit llama_params_fit_exception(const char * what_arg) : std::runtime_error(what_arg) {}
};
160
-
161
- static void llama_params_fit_impl(
162
- const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
163
- float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
164
- size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
165
- constexpr int64_t MiB = 1024*1024;
166
- typedef std::vector<llama_device_memory_data> dmds_t;
167
- const llama_model_params default_mparams = llama_model_default_params();
168
-
169
- std::vector<ggml_backend_dev_t> devs;
170
- uint32_t hp_ngl = 0; // hparams.n_gpu_layers
171
- uint32_t hp_nct = 0; // hparams.n_ctx_train
172
- uint32_t hp_nex = 0; // hparams.n_expert
173
-
174
- // step 1: get data for default parameters and check whether any changes are necessary in the first place
175
-
176
- LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
177
- const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
178
- const size_t nd = devs.size(); // number of devices
179
- if (nd == 0) {
180
- LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
181
- return;
182
- }
183
-
184
- std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
185
- margins.reserve(nd);
186
- for (size_t id = 0; id < nd; id++) {
187
- margins.push_back(margins_s[id]);
188
- }
189
-
190
- std::vector<std::string> dev_names;
191
- {
192
- dev_names.reserve(nd);
193
- size_t max_length = 0;
194
- for (ggml_backend_dev_t dev : devs) {
195
- std::string name = ggml_backend_dev_name(dev);
196
- name += " (";
197
- name += ggml_backend_dev_description(dev);
198
- name += ")";
199
- dev_names.push_back(name);
200
- max_length = std::max(max_length, name.length());
201
- }
202
- for (std::string & dn : dev_names) {
203
- dn.insert(dn.end(), max_length - dn.length(), ' ');
204
- }
205
- }
206
-
207
- int64_t sum_free = 0;
208
- int64_t sum_projected_free = 0;
209
- int64_t sum_projected_used = 0;
210
- int64_t sum_projected_model = 0;
211
- std::vector<int64_t> projected_free_per_device;
212
- projected_free_per_device.reserve(nd);
213
-
214
- if (nd > 1) {
215
- LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
216
- }
217
- for (size_t id = 0; id < nd; id++) {
218
- const llama_device_memory_data & dmd = dmds_full[id];
219
-
220
- const int64_t projected_used = dmd.mb.total();
221
- const int64_t projected_free = dmd.free - projected_used;
222
- projected_free_per_device.push_back(projected_free);
223
-
224
- sum_free += dmd.free;
225
- sum_projected_used += projected_used;
226
- sum_projected_free += projected_free;
227
- sum_projected_model += dmd.mb.model;
228
-
229
- if (nd > 1) {
230
- LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
231
- __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
232
- }
233
- }
234
- assert(sum_free >= 0 && sum_projected_used >= 0);
235
- LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
236
- __func__, sum_projected_used/MiB, sum_free/MiB);
237
- if (nd == 1) {
238
- if (projected_free_per_device[0] >= margins[0]) {
239
- LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
240
- __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
241
- return;
242
- }
243
- } else {
244
- bool changes_needed = false;
245
- for (size_t id = 0; id < nd; id++) {
246
- if (projected_free_per_device[id] < margins[id]) {
247
- changes_needed = true;
248
- break;
249
- }
250
- }
251
- if (!changes_needed) {
252
- LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
253
- return;
254
- }
255
- }
256
-
257
- // step 2: try reducing memory use by reducing the context size
258
-
259
- {
260
- int64_t global_surplus = sum_projected_free;
261
- for (size_t id = 0; id < nd; id++) {
262
- global_surplus -= margins[id];
263
- }
264
- if (global_surplus < 0) {
265
- if (nd == 1) {
266
- LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
267
- __func__, margins[0]/MiB, -global_surplus/MiB);
268
- } else {
269
- LLAMA_LOG_INFO(
270
- "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
271
- __func__, -global_surplus/MiB);
272
- }
273
- if (cparams->n_ctx == 0) {
274
- if (hp_nct > n_ctx_min) {
275
- int64_t sum_used_target = sum_free;
276
- for (size_t id = 0; id < nd; id++) {
277
- sum_used_target -= margins[id];
278
- }
279
- if (nd > 1) {
280
- // for multiple devices we need to be more conservative in terms of how much context we think can fit:
281
- // - for dense models only whole layers can be assigned to devices
282
- // - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
283
- // - on average we expect a waste of 0.5 layers/tensors per device
284
- // - use slightly more than the expected average for nd devices to be safe
285
- const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
286
- sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
287
- }
288
-
289
- int64_t sum_projected_used_min_ctx = 0;
290
- cparams->n_ctx = n_ctx_min;
291
- const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
292
- for (const auto & dmd : dmds_min_ctx) {
293
- sum_projected_used_min_ctx += dmd.mb.total();
294
- }
295
- if (sum_used_target > sum_projected_used_min_ctx) {
296
- // linear interpolation between minimum and maximum context size:
297
- cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
298
- / (sum_projected_used - sum_projected_used_min_ctx);
299
- cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
300
-
301
- const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
302
- const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
303
- LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
304
- __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
305
- if (nd == 1) {
306
- LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
307
- return;
308
- }
309
- LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
310
- } else {
311
- const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
312
- LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
313
- __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
314
- }
315
- } else {
316
- if (n_ctx_min == UINT32_MAX) {
317
- LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
318
- } else {
319
- LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
320
- __func__, hp_nct, n_ctx_min);
321
- }
322
- }
323
- } else {
324
- LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
325
- }
326
- }
327
- }
328
-
329
- if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
330
- throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
331
- }
332
- if (nd > 1) {
333
- if (!tensor_split) {
334
- throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
335
- }
336
- if (mparams->tensor_split) {
337
- for (size_t id = 0; id < nd; id++) {
338
- if (mparams->tensor_split[id] != 0.0f) {
339
- throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
340
- }
341
- }
342
- }
343
- if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
344
- throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
345
- }
346
- }
347
- if (!tensor_buft_overrides) {
348
- throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
349
- }
350
- if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
351
- throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
352
- }
353
-
354
- // step 3: iteratively fill the back to front with "dense" layers
355
- // - for a dense model simply fill full layers, giving each device a contiguous slice of the model
356
- // - for a MoE model, same as dense model but with all MoE tensors in system memory
357
-
358
- // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
359
- auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
360
- constexpr size_t n_strings = 1000;
361
- if (il >= n_strings) {
362
- throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
363
- }
364
- switch (lf) {
365
- case LAYER_FRACTION_ATTN: {
366
- static std::array<std::string, n_strings> patterns;
367
- if (patterns[il].empty()) {
368
- patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*";
369
- }
370
- return patterns[il].c_str();
371
- }
372
- case LAYER_FRACTION_UP: {
373
- static std::array<std::string, n_strings> patterns;
374
- if (patterns[il].empty()) {
375
- patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*";
376
- }
377
- return patterns[il].c_str();
378
- }
379
- case LAYER_FRACTION_GATE: {
380
- static std::array<std::string, n_strings> patterns;
381
- if (patterns[il].empty()) {
382
- patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
383
- }
384
- return patterns[il].c_str();
385
- }
386
- case LAYER_FRACTION_MOE: {
387
- static std::array<std::string, n_strings> patterns;
388
- if (patterns[il].empty()) {
389
- patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps";
390
- }
391
- return patterns[il].c_str();
392
- }
393
- default:
394
- GGML_ABORT("fatal error");
395
- }
396
- };
397
-
398
- struct ngl_t {
399
- uint32_t n_layer = 0; // number of total layers
400
- uint32_t n_part = 0; // number of partial layers, <= n_layer
401
-
402
- // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
403
- layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
404
-
405
- uint32_t n_full() const {
406
- assert(n_layer >= n_part);
407
- return n_layer - n_part;
408
- }
409
- };
410
-
411
- const size_t ntbo = llama_max_tensor_buft_overrides();
412
-
413
- // utility function to set n_gpu_layers and tensor_split
414
- auto set_ngl_tensor_split_tbo = [&](
415
- const std::vector<ngl_t> & ngl_per_device,
416
- const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
417
- llama_model_params & mparams) {
418
- mparams.n_gpu_layers = 0;
419
- for (size_t id = 0; id < nd; id++) {
420
- mparams.n_gpu_layers += ngl_per_device[id].n_layer;
421
- if (nd > 1) {
422
- tensor_split[id] = ngl_per_device[id].n_layer;
423
- }
424
- }
425
- assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
426
- uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
427
-
428
- mparams.tensor_split = tensor_split;
429
-
430
- size_t itbo = 0;
431
- for (size_t id = 0; id < nd; id++) {
432
- il0 += ngl_per_device[id].n_full();
433
- for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
434
- if (itbo + 1 >= ntbo) {
435
- tensor_buft_overrides[itbo].pattern = nullptr;
436
- tensor_buft_overrides[itbo].buft = nullptr;
437
- itbo++;
438
- mparams.tensor_buft_overrides = tensor_buft_overrides;
439
- throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
440
- + std::to_string(ntbo) + " is insufficient for model");
441
- }
442
- tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
443
- tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
444
- itbo++;
445
- }
446
- il0 += ngl_per_device[id].n_part;
447
- }
448
- tensor_buft_overrides[itbo].pattern = nullptr;
449
- tensor_buft_overrides[itbo].buft = nullptr;
450
- itbo++;
451
- mparams.tensor_buft_overrides = tensor_buft_overrides;
452
- };
453
-
454
- // utility function that returns the memory use per device for given numbers of layers per device
455
- auto get_memory_for_layers = [&](
456
- const char * func_name,
457
- const std::vector<ngl_t> & ngl_per_device,
458
- const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
459
- llama_model_params mparams_copy = *mparams;
460
- set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
461
-
462
- const dmds_t dmd_nl = llama_get_device_memory_data(
463
- path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
464
-
465
- LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
466
- for (size_t id = 0; id < nd; id++) {
467
- const ngl_t & n = ngl_per_device[id];
468
- LLAMA_LOG_DEBUG(
469
- "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
470
- func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
471
- }
472
-
473
- std::vector<int64_t> ret;
474
- ret.reserve(nd);
475
- for (const llama_device_memory_data & dmd : dmd_nl) {
476
- ret.push_back(dmd.mb.total());
477
- }
478
- return ret;
479
- };
480
-
481
- int64_t global_surplus_cpu_moe = 0;
482
- if (hp_nex > 0) {
483
- const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors
484
- ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
485
- tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
486
- tensor_buft_overrides[1] = {nullptr, nullptr};
487
- mparams->tensor_buft_overrides = tensor_buft_overrides;
488
-
489
- LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
490
- const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
491
- path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
492
-
493
- for (size_t id = 0; id < nd; id++) {
494
- global_surplus_cpu_moe += dmds_cpu_moe[id].free;
495
- global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
496
- }
497
-
498
- if (global_surplus_cpu_moe > 0) {
499
- LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
500
- __func__, global_surplus_cpu_moe/MiB);
501
- } else {
502
- LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
503
- __func__, -global_surplus_cpu_moe/MiB);
504
- }
505
-
506
- // reset
507
- tensor_buft_overrides[0] = {nullptr, nullptr};
508
- mparams->tensor_buft_overrides = tensor_buft_overrides;
509
- }
510
-
511
- std::vector<int64_t> targets; // maximum acceptable memory use per device
512
- targets.reserve(nd);
513
- for (size_t id = 0; id < nd; id++) {
514
- targets.push_back(dmds_full[id].free - margins[id]);
515
- LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
516
- }
517
-
518
- std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
519
- overflow_bufts.reserve(nd);
520
- for (size_t id = 0; id < nd; id++) {
521
- overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
522
- }
523
-
524
- std::vector<ngl_t> ngl_per_device(nd);
525
- std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
526
-
527
- // optimize the number of layers per device using the method of false position:
528
- // - ngl_per_device has 0 layers for each device, lower bound
529
- // - try a "high" configuration where a device is given all unassigned layers
530
- // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
531
- // - check memory use of our guess, replace either the low or high bound
532
- // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
533
- // - the last device has the output layer, which cannot be a partial layer
534
- if (hp_nex == 0) {
535
- LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
536
- } else {
537
- LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
538
- }
539
- for (int id = nd - 1; id >= 0; id--) {
540
- uint32_t n_unassigned = hp_ngl + 1;
541
- for (size_t jd = id + 1; jd < nd; ++jd) {
542
- assert(n_unassigned >= ngl_per_device[jd].n_layer);
543
- n_unassigned -= ngl_per_device[jd].n_layer;
544
- }
545
-
546
- std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
547
- ngl_per_device_high[id].n_layer = n_unassigned;
548
- if (hp_nex > 0) {
549
- ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
550
- }
551
- if (ngl_per_device_high[id].n_layer > 0) {
552
- std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
553
- if (mem_high[id] > targets[id]) {
554
- assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
555
- uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
556
- LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
557
- while (delta > 1) {
558
- uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
559
- step_size = std::max(step_size, uint32_t(1));
560
- step_size = std::min(step_size, delta - 1);
561
-
562
- std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
563
- ngl_per_device_test[id].n_layer += step_size;
564
- if (hp_nex) {
565
- ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
566
- step_size - 1 : step_size; // the first layer is the output layer which must always be full
567
- }
568
- const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
569
-
570
- if (mem_test[id] <= targets[id]) {
571
- ngl_per_device = ngl_per_device_test;
572
- mem = mem_test;
573
- LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
574
- } else {
575
- ngl_per_device_high = ngl_per_device_test;
576
- mem_high = mem_test;
577
- LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
578
- }
579
- delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
580
- }
581
- } else {
582
- assert(ngl_per_device_high[id].n_layer == n_unassigned);
583
- ngl_per_device = ngl_per_device_high;
584
- mem = mem_high;
585
- LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
586
- }
587
- }
588
-
589
- const int64_t projected_margin = dmds_full[id].free - mem[id];
590
- LLAMA_LOG_INFO(
591
- "%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
592
- __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
593
- }
594
- if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
595
- set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
596
- return;
597
- }
598
-
599
- // step 4: for a MoE model where all dense tensors fit,
600
- // convert the dense-only layers in the back to full layers in the front until all devices are full
601
- // essentially the same procedure as for the dense-only layers except front-to-back
602
- // also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
603
-
604
- size_t id_dense_start = nd;
605
- for (int id = nd - 1; id >= 0; id--) {
606
- if (ngl_per_device[id].n_layer > 0) {
607
- id_dense_start = id;
608
- continue;
609
- }
610
- break;
611
- }
612
- assert(id_dense_start < nd);
613
-
614
- LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
615
- for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
616
- std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
617
- for (size_t jd = id_dense_start; jd < nd; jd++) {
618
- const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
619
- ngl_per_device_high[id].n_layer += n_layer_move;
620
- ngl_per_device_high[jd].n_layer -= n_layer_move;
621
- ngl_per_device_high[jd].n_part = 0;
622
- }
623
- size_t id_dense_start_high = nd - 1;
624
- std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
625
-
626
- if (mem_high[id] > targets[id]) {
627
- assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
628
- uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
629
- while (delta > 1) {
630
- uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
631
- step_size = std::max(step_size, uint32_t(1));
632
- step_size = std::min(step_size, delta - 1);
633
-
634
- std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
635
- size_t id_dense_start_test = id_dense_start;
636
- uint32_t n_converted_test = 0;
637
- for (;id_dense_start_test < nd; id_dense_start_test++) {
638
- const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
639
- ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
640
- ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
641
- ngl_per_device_test[id].n_layer += n_convert_jd;
642
- n_converted_test += n_convert_jd;
643
-
644
- if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
645
- break;
646
- }
647
- }
648
- const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
649
-
650
- if (mem_test[id] <= targets[id]) {
651
- ngl_per_device = ngl_per_device_test;
652
- mem = mem_test;
653
- id_dense_start = id_dense_start_test;
654
- LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
655
- __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
656
- } else {
657
- ngl_per_device_high = ngl_per_device_test;
658
- mem_high = mem_test;
659
- id_dense_start_high = id_dense_start_test;
660
- LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
661
- __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
662
- }
663
- assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
664
- delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
665
- }
666
- } else {
667
- ngl_per_device = ngl_per_device_high;
668
- mem = mem_high;
669
- id_dense_start = id_dense_start_high;
670
- LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
671
- __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
672
- }
673
-
674
- // try to fit at least part of one more layer
675
- if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
676
- std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
677
- size_t id_dense_start_test = id_dense_start;
678
- ngl_per_device_test[id_dense_start_test].n_layer--;
679
- ngl_per_device_test[id_dense_start_test].n_part--;
680
- ngl_per_device_test[id].n_layer++;
681
- ngl_per_device_test[id].n_part++;
682
- if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
683
- id_dense_start_test++;
684
- }
685
- ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
686
- std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
687
- if (id < nd - 1) {
688
- overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
689
- }
690
- LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
691
- std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
692
- if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
693
- ngl_per_device = ngl_per_device_test;
694
- overflow_bufts = overflow_bufts_test;
695
- mem = mem_test;
696
- id_dense_start = id_dense_start_test;
697
- LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
698
- __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
699
-
700
- ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
701
- LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
702
- mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
703
- if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
704
- ngl_per_device = ngl_per_device_test;
705
- overflow_bufts = overflow_bufts_test;
706
- mem = mem_test;
707
- id_dense_start = id_dense_start_test;
708
- LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
709
- __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
710
- }
711
- } else {
712
- ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
713
- LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
714
- mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
715
- if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
716
- ngl_per_device = ngl_per_device_test;
717
- overflow_bufts = overflow_bufts_test;
718
- mem = mem_test;
719
- id_dense_start = id_dense_start_test;
720
- LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
721
- __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
722
- }
723
- }
724
- }
725
-
726
- const int64_t projected_margin = dmds_full[id].free - mem[id];
727
- LLAMA_LOG_INFO(
728
- "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
729
- __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
730
- }
731
-
732
- // print info for devices that were not changed during the conversion from dense only to full layers:
733
- for (size_t id = id_dense_start + 1; id < nd; id++) {
734
- const int64_t projected_margin = dmds_full[id].free - mem[id];
735
- LLAMA_LOG_INFO(
736
- "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
737
- __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
738
- }
739
-
740
- set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
741
- }
742
-
743
- enum llama_params_fit_status llama_params_fit(
744
- const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
745
- float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
746
- size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
747
- const int64_t t0_us = llama_time_us();
748
- llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
749
- try {
750
- llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
751
- LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
752
- } catch (const llama_params_fit_exception & e) {
753
- LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
754
- status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
755
- } catch (const std::runtime_error & e) {
756
- LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
757
- status = LLAMA_PARAMS_FIT_STATUS_ERROR;
758
- }
759
- const int64_t t1_us = llama_time_us();
760
- LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
761
- return status;
762
- }
763
-
764
- struct llama_sampler_chain_params llama_sampler_chain_default_params() {
765
- struct llama_sampler_chain_params result = {
766
- /*.no_perf =*/ true,
767
- };
768
-
769
- return result;
770
- }
771
-
772
// Returns the device limit as a constant (currently 16).
size_t llama_max_devices(void) {
    constexpr size_t max_devices = 16;
    return max_devices;
}
775
-
776
// Returns the tensor buffer type override limit as a constant (currently 4096).
size_t llama_max_tensor_buft_overrides() {
    constexpr size_t max_overrides = 4096;
    return max_overrides;
}
779
-
780
- bool llama_supports_mmap(void) {
781
- return llama_mmap::SUPPORTED;
782
- }
783
-
784
- bool llama_supports_mlock(void) {
785
- return llama_mlock::SUPPORTED;
786
- }
787
-
788
- bool llama_supports_gpu_offload(void) {
789
- return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
790
- ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
791
- llama_supports_rpc();
792
- }
793
-
794
- bool llama_supports_rpc(void) {
795
- return ggml_backend_reg_by_name("RPC") != nullptr;
796
- }
797
-
798
- void llama_backend_init(void) {
799
- ggml_time_init();
800
-
801
- // needed to initialize f16 tables
802
- {
803
- struct ggml_init_params params = { 0, NULL, false };
804
- struct ggml_context * ctx = ggml_init(params);
805
- ggml_free(ctx);
806
- }
807
- }
808
-
809
- void llama_numa_init(enum ggml_numa_strategy numa) {
810
- if (numa != GGML_NUMA_STRATEGY_DISABLED) {
811
- auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
812
- GGML_ASSERT(dev && "CPU backend is not loaded");
813
- auto * reg = ggml_backend_dev_backend_reg(dev);
814
- auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
815
- if (numa_init_fn) {
816
- numa_init_fn(numa);
817
- }
818
- }
819
- }
820
-
821
- void llama_backend_free(void) {
822
- ggml_quantize_free();
823
- }
824
-
825
- int64_t llama_time_us(void) {
826
- return ggml_time_us();
827
- }
828
-
829
- // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
830
- static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
831
- const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
832
- // loading time will be recalculated after the first eval, so
833
- // we take page faults deferred by mmap() into consideration
834
- model.t_load_us = 0;
835
- time_meas tm(model.t_load_us);
836
-
837
- model.t_start_us = tm.t_start_us;
838
-
839
- try {
840
- llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
841
- params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
842
-
843
- ml.print_info();
844
-
845
- model.hparams.vocab_only = params.vocab_only;
846
- model.hparams.no_alloc = params.no_alloc;
847
-
848
- try {
849
- model.load_arch(ml);
850
- } catch(const std::exception & e) {
851
- throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
852
- }
853
- try {
854
- model.load_hparams(ml);
855
- } catch(const std::exception & e) {
856
- throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
857
- }
858
- if (model.arch == LLM_ARCH_CLIP) {
859
- throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
860
- }
861
- try {
862
- model.load_vocab(ml);
863
- } catch(const std::exception & e) {
864
- throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
865
- }
866
-
867
- model.load_stats(ml);
868
- model.print_info();
869
-
870
- if (params.vocab_only) {
871
- LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
872
- return 0;
873
- }
874
-
875
- if (!model.load_tensors(ml)) {
876
- return -2;
877
- }
878
- } catch (const std::exception & err) {
879
- LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
880
- return -1;
881
- }
882
-
883
- return 0;
884
- }
885
-
886
- static struct llama_model * llama_model_load_from_file_impl(
887
- struct gguf_context * metadata,
888
- llama_model_set_tensor_data_t set_tensor_data,
889
- void * set_tensor_data_ud,
890
- const std::string & path_model,
891
- std::vector<std::string> & splits,
892
- struct llama_model_params params) {
893
- GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
894
- ggml_time_init();
895
-
896
- if (!params.vocab_only && ggml_backend_reg_count() == 0) {
897
- LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
898
- return nullptr;
899
- }
900
-
901
- unsigned cur_percentage = 0;
902
- if (params.progress_callback == NULL) {
903
- params.progress_callback_user_data = &cur_percentage;
904
- params.progress_callback = [](float progress, void * ctx) {
905
- unsigned * cur_percentage_p = (unsigned *) ctx;
906
- unsigned percentage = (unsigned) (100 * progress);
907
- while (percentage > *cur_percentage_p) {
908
- *cur_percentage_p = percentage;
909
- LLAMA_LOG_CONT(".");
910
- if (percentage >= 100) {
911
- LLAMA_LOG_CONT("\n");
912
- }
913
- }
914
- return true;
915
- };
916
- }
917
-
918
- llama_model * model = new llama_model(params);
919
-
920
- // create list of devices to use with this model
921
- if (params.devices) {
922
- for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
923
- model->devices.push_back(*dev);
924
- }
925
- } else {
926
- // default device selection
927
-
928
- // build list of available devices
929
- std::vector<ggml_backend_dev_t> gpus;
930
- std::vector<ggml_backend_dev_t> igpus;
931
- std::vector<ggml_backend_dev_t> rpc_servers;
932
-
933
- for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
934
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
935
- switch (ggml_backend_dev_type(dev)) {
936
- case GGML_BACKEND_DEVICE_TYPE_CPU:
937
- case GGML_BACKEND_DEVICE_TYPE_ACCEL:
938
- // skip CPU backends since they are handled separately
939
- break;
940
-
941
- case GGML_BACKEND_DEVICE_TYPE_GPU: {
942
- ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
943
- if (ggml_backend_reg_name(reg) == std::string("RPC")) {
944
- rpc_servers.push_back(dev);
945
- } else {
946
- // check if there is already a GPU with the same device id
947
- ggml_backend_dev_props props;
948
- ggml_backend_dev_get_props(dev, &props);
949
- auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
950
- ggml_backend_dev_props d_props;
951
- ggml_backend_dev_get_props(d, &d_props);
952
- if (props.device_id && d_props.device_id) {
953
- return strcmp(props.device_id, d_props.device_id) == 0;
954
- }
955
- return false;
956
- });
957
-
958
- if (it != gpus.end()) {
959
- LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
960
- __func__,
961
- ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
962
- props.device_id ? props.device_id : "unknown id",
963
- ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
964
- } else {
965
- gpus.push_back(dev);
966
- }
967
- }
968
- break;
969
- }
970
-
971
- case GGML_BACKEND_DEVICE_TYPE_IGPU:
972
- igpus.push_back(dev);
973
- break;
974
- }
975
- }
976
-
977
- // add RPC servers at the front of the list to minimize network transfers
978
- model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
979
-
980
- // add GPUs
981
- model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
982
-
983
- // add integrated GPUs only if no other devices were found
984
- if (model->devices.empty()) {
985
- model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
986
- }
987
- }
988
-
989
- // if using single GPU mode, remove all except the main GPU
990
- if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
991
- if (params.main_gpu < 0) {
992
- model->devices.clear();
993
- } else {
994
- if (params.main_gpu >= (int)model->devices.size()) {
995
- LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
996
- llama_model_free(model);
997
- return nullptr;
998
- }
999
- ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
1000
- model->devices.clear();
1001
- model->devices.push_back(main_gpu);
1002
- }
1003
- }
1004
-
1005
- for (auto * dev : model->devices) {
1006
- ggml_backend_dev_props props;
1007
- ggml_backend_dev_get_props(dev, &props);
1008
- LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
1009
- ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
1010
- props.device_id ? props.device_id : "unknown id",
1011
- props.memory_free/1024/1024);
1012
- }
1013
-
1014
- const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
1015
- GGML_ASSERT(status <= 0);
1016
- if (status < 0) {
1017
- if (status == -1) {
1018
- LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
1019
- } else if (status == -2) {
1020
- LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
1021
- }
1022
-
1023
- llama_model_free(model);
1024
- return nullptr;
1025
- }
1026
-
1027
- return model;
1028
- }
1029
-
1030
- struct llama_model * llama_model_init_from_user(
1031
- struct gguf_context * metadata,
1032
- llama_model_set_tensor_data_t set_tensor_data,
1033
- void * set_tensor_data_ud,
1034
- struct llama_model_params params) {
1035
- GGML_ASSERT(metadata != nullptr);
1036
- std::string path_model;
1037
- std::vector<std::string> splits = {};
1038
- params.use_mmap = false;
1039
- params.use_extra_bufts = false;
1040
- return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
1041
- }
1042
- // deprecated
1043
- struct llama_model * llama_load_model_from_file(
1044
- const char * path_model,
1045
- struct llama_model_params params) {
1046
- return llama_model_load_from_file(path_model, params);
1047
- }
1048
-
1049
- struct llama_model * llama_model_load_from_file(
1050
- const char * path_model,
1051
- struct llama_model_params params) {
1052
- std::vector<std::string> splits = {};
1053
- return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
1054
- }
1055
-
1056
- struct llama_model * llama_model_load_from_splits(
1057
- const char ** paths,
1058
- size_t n_paths,
1059
- struct llama_model_params params) {
1060
- std::vector<std::string> splits;
1061
- if (n_paths == 0) {
1062
- LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
1063
- return nullptr;
1064
- }
1065
- splits.reserve(n_paths);
1066
- for (size_t i = 0; i < n_paths; ++i) {
1067
- splits.push_back(paths[i]);
1068
- }
1069
- return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
1070
- }
1071
-
1072
- void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
1073
- llama_model_saver ms(model);
1074
- ms.add_kv_from_model();
1075
- ms.add_tensors_from_model();
1076
- ms.save(path_model);
1077
- }
1078
-
1079
- //
1080
- // chat templates
1081
- //
1082
-
1083
- int32_t llama_chat_apply_template(
1084
- const char * tmpl,
1085
- const struct llama_chat_message * chat,
1086
- size_t n_msg,
1087
- bool add_ass,
1088
- char * buf,
1089
- int32_t length) {
1090
- const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);
1091
-
1092
- // format the chat to string
1093
- std::vector<const llama_chat_message *> chat_vec;
1094
- chat_vec.resize(n_msg);
1095
- for (size_t i = 0; i < n_msg; i++) {
1096
- chat_vec[i] = &chat[i];
1097
- }
1098
-
1099
- std::string formatted_chat;
1100
- llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl);
1101
- if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
1102
- return -1;
1103
- }
1104
- int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass);
1105
- if (res < 0) {
1106
- return res;
1107
- }
1108
- if (buf && length > 0) {
1109
- strncpy(buf, formatted_chat.c_str(), length);
1110
- }
1111
- return res;
1112
- }
1113
-
1114
- //
1115
- // model split
1116
- //
1117
-
1118
// Writes "<path_prefix>-<split_no + 1>-of-<split_count>.gguf" into
// `split_path`, with both numbers zero-padded to five digits.
//
// Returns the number of characters written (terminator excluded), or 0
// when snprintf fails or the result does not fit in `maxlen` bytes.
int32_t llama_split_path(
        char * split_path,
        size_t maxlen,
        const char * path_prefix,
        int32_t split_no,
        int32_t split_count) {
    // split_no is zero-based on input, one-based in the file name
    const int n_chars = snprintf(split_path, maxlen, "%s-%05d-of-%05d.gguf", path_prefix, split_no + 1, split_count);

    const bool fits = n_chars >= 0 && (size_t) n_chars < maxlen;
    return fits ? (int32_t) n_chars : 0;
}
1142
-
1143
// Recovers the prefix of a split file name.
//
// `split_path` must be longer than, and end with, the suffix
// "-<split_no + 1>-of-<split_count>.gguf" (five-digit zero padding). When
// it does, the leading part is copied into `split_prefix` (truncated to
// `maxlen` bytes including the terminator) and the prefix length is
// returned. In every other case 0 is returned and `split_prefix` is left
// untouched.
int32_t llama_split_prefix(
        char * split_prefix,
        size_t maxlen,
        const char * split_path,
        int32_t split_no,
        int32_t split_count) {
    char suffix_buf[32];
    snprintf(suffix_buf, sizeof(suffix_buf), "-%05d-of-%05d.gguf", split_no + 1, split_count);

    const std::string suffix(suffix_buf);
    const std::string full_path(split_path);

    // a path that is only the suffix (or shorter) has no prefix to return
    if (full_path.size() <= suffix.size()) {
        return 0;
    }

    const size_t prefix_len = full_path.size() - suffix.size();
    if (full_path.compare(prefix_len, std::string::npos, suffix) != 0) {
        return 0;
    }

    // +1 leaves room for the terminator that snprintf writes
    const size_t n_copy = std::min(prefix_len + 1, maxlen);
    snprintf(split_prefix, n_copy, "%s", split_path);

    return (int32_t) prefix_len;
}
1171
-
1172
- const char * llama_print_system_info(void) {
1173
- static std::string s;
1174
- s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
1175
-
1176
- for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
1177
- auto * reg = ggml_backend_reg_get(i);
1178
- auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
1179
- if (get_features_fn) {
1180
- ggml_backend_feature * features = get_features_fn(reg);
1181
- s += ggml_backend_reg_name(reg);
1182
- s += " : ";
1183
- for (; features->name; features++) {
1184
- s += features->name;
1185
- s += " = ";
1186
- s += features->value;
1187
- s += " | ";
1188
- }
1189
- }
1190
- }
1191
-
1192
- return s.c_str();
1193
- }
1194
-