whispercpp 1.3.6 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (828)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/README.md +38 -5
  5. data/Rakefile +18 -3
  6. data/ext/dependencies.rb +10 -4
  7. data/ext/dependencies_for_windows.rb +17 -0
  8. data/ext/extconf.rb +20 -8
  9. data/ext/options.rb +54 -14
  10. data/ext/options_for_windows.rb +51 -0
  11. data/ext/ruby_whisper.c +36 -42
  12. data/ext/ruby_whisper.h +135 -0
  13. data/ext/ruby_whisper_context.c +107 -28
  14. data/ext/ruby_whisper_log_queue.c +180 -0
  15. data/ext/ruby_whisper_log_settable.h +47 -0
  16. data/ext/ruby_whisper_parakeet.c +49 -0
  17. data/ext/ruby_whisper_parakeet_context.c +304 -0
  18. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  19. data/ext/ruby_whisper_parakeet_model.c +84 -0
  20. data/ext/ruby_whisper_parakeet_params.c +548 -0
  21. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  22. data/ext/ruby_whisper_parakeet_token.c +188 -0
  23. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  24. data/ext/ruby_whisper_params.c +256 -65
  25. data/ext/ruby_whisper_segment.c +6 -6
  26. data/ext/ruby_whisper_transcribe.cpp +42 -15
  27. data/ext/sources/CMakeLists.txt +41 -3
  28. data/ext/sources/CMakePresets.json +95 -0
  29. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  30. data/ext/sources/cmake/parakeet.pc.in +10 -0
  31. data/ext/sources/cmake/whisper.pc.in +1 -1
  32. data/ext/sources/examples/CMakeLists.txt +4 -2
  33. data/ext/sources/examples/bench/bench.cpp +1 -1
  34. data/ext/sources/examples/cli/cli.cpp +43 -9
  35. data/ext/sources/examples/common-ggml.cpp +2 -0
  36. data/ext/sources/examples/common-whisper.cpp +139 -67
  37. data/ext/sources/examples/common-whisper.h +11 -0
  38. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  39. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  40. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  41. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  42. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  43. data/ext/sources/examples/server/server.cpp +199 -163
  44. data/ext/sources/ggml/CMakeLists.txt +21 -13
  45. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  46. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  47. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  48. data/ext/sources/ggml/include/ggml-backend.h +72 -10
  49. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  50. data/ext/sources/ggml/include/ggml-rpc.h +3 -3
  51. data/ext/sources/ggml/include/ggml.h +101 -9
  52. data/ext/sources/ggml/include/gguf.h +10 -2
  53. data/ext/sources/ggml/src/CMakeLists.txt +22 -5
  54. data/ext/sources/ggml/src/ggml-alloc.c +5 -1
  55. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  56. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  57. data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
  58. data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
  59. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
  60. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
  61. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
  62. data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
  63. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
  64. data/ext/sources/ggml/src/ggml-common.h +11 -0
  65. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
  66. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
  67. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
  68. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
  69. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
  70. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  71. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  72. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
  73. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
  74. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
  75. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  76. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
  77. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
  78. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
  79. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  80. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
  81. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
  82. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  83. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
  84. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
  85. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
  86. data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
  87. data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
  88. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  89. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
  90. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
  91. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
  92. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  93. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  94. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  95. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  96. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  97. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  98. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  99. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  100. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  101. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  102. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  103. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  104. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  105. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  106. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  107. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
  108. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  109. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
  110. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  111. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  112. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
  113. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
  114. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  115. data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
  116. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  117. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  118. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  119. data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
  120. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  121. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
  122. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  123. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
  124. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
  125. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
  129. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
  130. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  131. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  132. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  133. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
  134. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  135. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
  136. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  137. data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
  138. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
  139. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
  140. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  141. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
  142. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
  143. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
  144. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
  145. data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
  146. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  147. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
  148. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  149. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
  150. data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
  151. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  152. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  153. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  154. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  155. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  156. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
  157. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  158. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  159. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  160. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  161. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
  162. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  163. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  164. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  165. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
  166. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  167. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  168. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  169. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  170. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  171. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  172. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  173. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  174. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  176. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  177. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  178. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  179. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  191. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
  192. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
  193. data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
  194. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  195. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
  196. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  197. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
  198. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  199. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
  200. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
  201. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
  202. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
  203. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
  204. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
  205. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  206. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  207. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
  208. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  209. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  210. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  211. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
  212. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  213. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
  214. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
  215. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
  216. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
  217. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
  218. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  219. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  220. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  221. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  222. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  223. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  224. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  225. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  226. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
  227. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
  228. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  229. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
  230. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
  231. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
  232. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
  233. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  235. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
  254. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
  255. data/ext/sources/ggml/src/ggml-impl.h +6 -1
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
  259. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
  260. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
  261. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
  262. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
  263. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
  264. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  265. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
  266. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
  322. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
  323. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
  324. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
  325. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
  326. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
  327. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  328. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
  329. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
  330. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  331. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
  332. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
  333. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
  334. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
  335. data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
  336. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  337. data/ext/sources/ggml/src/ggml-quants.c +289 -114
  338. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  339. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  340. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  341. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  342. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  343. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
  344. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
  345. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
  346. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  347. data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
  348. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
  349. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
  350. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  351. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  352. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  353. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  354. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  355. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  356. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
  357. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
  358. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  359. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  360. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
  361. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
  362. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
  363. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
  364. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
  365. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  366. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  367. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
  368. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
  369. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  370. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  371. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
  372. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  373. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  374. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  375. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  376. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  377. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
  378. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  379. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  380. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  381. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  382. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  383. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  384. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  385. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  386. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  387. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
  388. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
  389. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
  390. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
  391. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
  392. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
  393. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
  394. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
  395. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
  396. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
  397. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
  398. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
  399. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
  400. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
  401. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
  402. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
  403. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
  404. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
  405. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
  406. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
  407. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
  408. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
  409. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
  410. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
  411. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
  412. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
  413. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
  414. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
  415. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
  416. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
  417. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
  418. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
  420. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
  421. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
  422. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
  423. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  424. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  425. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  426. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
  427. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
  428. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
  429. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
  430. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
  431. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
  432. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
  433. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
  434. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  449. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
  450. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
  451. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
  452. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
  453. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  454. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  455. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  456. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
  457. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  458. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  459. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  460. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  461. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  462. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  463. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  464. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  465. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  466. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  467. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  468. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  469. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  470. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
  471. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
  472. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  473. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  474. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  475. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  476. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
  477. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  478. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
  479. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  480. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
  481. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  482. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  483. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
  484. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  485. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
  486. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
  487. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  488. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  489. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
  490. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
  491. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
  492. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  493. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
  494. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
  495. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  496. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  497. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  498. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  499. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  500. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  501. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
  502. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  503. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  504. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
  505. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  506. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  507. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  508. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
  509. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
  510. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
  511. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  512. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  513. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  514. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  515. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  516. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  517. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  518. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
  519. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  520. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
  521. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  522. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  523. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  524. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  525. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  526. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
  527. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  528. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
  529. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
  530. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
  531. data/ext/sources/ggml/src/ggml.c +110 -28
  532. data/ext/sources/ggml/src/gguf.cpp +173 -28
  533. data/ext/sources/include/parakeet.h +342 -0
  534. data/ext/sources/include/whisper.h +10 -0
  535. data/ext/sources/media/matmul.png +0 -0
  536. data/ext/sources/src/CMakeLists.txt +23 -0
  537. data/ext/sources/src/parakeet-arch.h +188 -0
  538. data/ext/sources/src/parakeet.cpp +3838 -0
  539. data/ext/sources/src/whisper.cpp +56 -12
  540. data/extsources.rb +26 -10
  541. data/lib/whisper/log_settable.rb +36 -0
  542. data/lib/whisper/model/uri.rb +13 -1
  543. data/lib/whisper/output.rb +74 -0
  544. data/sig/whisper.rbs +411 -62
  545. data/test/helper.rb +2 -0
  546. data/test/jfk_reader/jfk_reader.c +50 -7
  547. data/test/test_callback.rb +1 -0
  548. data/test/test_package.rb +6 -5
  549. data/test/test_parakeet.rb +28 -0
  550. data/test/test_parakeet_callback.rb +107 -0
  551. data/test/test_parakeet_context.rb +116 -0
  552. data/test/test_parakeet_context_params.rb +24 -0
  553. data/test/test_parakeet_model.rb +21 -0
  554. data/test/test_parakeet_params.rb +78 -0
  555. data/test/test_parakeet_segment.rb +42 -0
  556. data/test/test_parakeet_token.rb +73 -0
  557. data/test/test_params.rb +2 -0
  558. data/test/test_vad_segment.rb +1 -1
  559. data/test/test_whisper.rb +24 -6
  560. data/whispercpp.gemspec +2 -2
  561. metadata +215 -281
  562. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  563. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  564. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  565. data/ext/sources/bindings/javascript/package.json +0 -26
  566. data/ext/sources/bindings/javascript/whisper.js +0 -19
  567. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  568. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  569. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  570. data/ext/sources/examples/addon.node/index.js +0 -59
  571. data/ext/sources/examples/addon.node/package.json +0 -16
  572. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  573. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  574. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  575. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  576. data/ext/sources/examples/coi-serviceworker.js +0 -146
  577. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  578. data/ext/sources/examples/command/command.cpp +0 -802
  579. data/ext/sources/examples/command/commands.txt +0 -9
  580. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  581. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  582. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  583. data/ext/sources/examples/generate-karaoke.sh +0 -57
  584. data/ext/sources/examples/helpers.js +0 -191
  585. data/ext/sources/examples/livestream.sh +0 -112
  586. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  587. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  588. data/ext/sources/examples/lsp/whisper.vim +0 -362
  589. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  590. data/ext/sources/examples/python/whisper_processor.py +0 -54
  591. data/ext/sources/examples/server/bench.js +0 -29
  592. data/ext/sources/examples/server.py +0 -120
  593. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  594. data/ext/sources/examples/stream/stream.cpp +0 -437
  595. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  596. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  597. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  598. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  599. data/ext/sources/examples/sycl/build.sh +0 -22
  600. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  601. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  602. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
  603. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  604. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
  605. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
  606. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
  607. data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
  608. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
  609. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  610. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
  611. data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
  612. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
  613. data/ext/sources/examples/talk-llama/llama-context.h +0 -359
  614. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  615. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
  616. data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
  617. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  618. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  619. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
  620. data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
  621. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
  622. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
  623. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  624. data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
  625. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  626. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  627. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
  628. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  629. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
  630. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
  631. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  632. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
  633. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
  634. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  635. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  636. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
  637. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  638. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  639. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  640. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
  641. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  642. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
  643. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
  644. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
  645. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
  646. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
  647. data/ext/sources/examples/talk-llama/llama-model.h +0 -597
  648. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
  649. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  650. data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
  651. data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
  652. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
  653. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
  654. data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
  655. data/ext/sources/examples/talk-llama/llama.h +0 -1573
  656. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
  657. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  658. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  659. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
  660. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  661. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
  662. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
  663. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
  664. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
  665. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
  666. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  667. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  668. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  669. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  670. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  671. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  672. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  673. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
  674. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  675. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
  676. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
  677. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
  678. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
  679. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  680. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
  681. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  682. data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
  683. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
  684. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  685. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  686. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
  687. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  688. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  689. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  690. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  691. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  692. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  693. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  694. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
  695. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  696. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  697. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
  698. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
  699. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  700. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
  701. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  702. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
  703. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  704. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  705. data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
  706. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  707. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
  708. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
  709. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  710. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  711. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  712. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
  713. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  714. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
  715. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
  716. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
  717. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
  718. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
  719. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  720. data/ext/sources/examples/talk-llama/models/models.h +0 -704
  721. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
  722. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  723. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
  724. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  725. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  726. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  727. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  728. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  729. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  730. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  731. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  732. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
  733. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  734. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  735. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  736. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  737. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
  738. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  739. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
  740. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  741. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  742. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  743. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  744. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
  745. data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
  746. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
  747. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
  748. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
  749. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
  750. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
  751. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  752. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  753. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
  754. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  755. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  756. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
  757. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  758. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  759. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  760. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  761. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  762. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  763. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  764. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
  765. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  766. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  767. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  768. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  769. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  770. data/ext/sources/examples/talk-llama/speak +0 -40
  771. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  772. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  773. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  774. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  775. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  776. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
  777. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  778. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  779. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  780. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  781. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  782. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  783. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  784. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  785. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  786. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  787. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  788. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  789. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  790. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  791. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
  792. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  793. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  794. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
  795. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
  796. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  798. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
  799. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  800. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
  801. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  802. data/ext/sources/tests/CMakeLists.txt +0 -112
  803. data/ext/sources/tests/earnings21/eval.mk +0 -58
  804. data/ext/sources/tests/earnings21/eval.py +0 -68
  805. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  806. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  807. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  808. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  809. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  810. data/ext/sources/tests/en-0-ref.txt +0 -1
  811. data/ext/sources/tests/en-1-ref.txt +0 -1
  812. data/ext/sources/tests/en-2-ref.txt +0 -1
  813. data/ext/sources/tests/es-0-ref.txt +0 -1
  814. data/ext/sources/tests/librispeech/eval.mk +0 -39
  815. data/ext/sources/tests/librispeech/eval.py +0 -47
  816. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  817. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  818. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  819. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  820. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  821. data/ext/sources/tests/run-tests.sh +0 -130
  822. data/ext/sources/tests/test-c.c +0 -3
  823. data/ext/sources/tests/test-vad-full.cpp +0 -56
  824. data/ext/sources/tests/test-vad.cpp +0 -83
  825. data/ext/sources/tests/test-whisper.js +0 -58
  826. data/lib/whisper/context.rb +0 -15
  827. data/lib/whisper/segment.rb +0 -58
  828. /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
@@ -1,1655 +0,0 @@
1
- #include "llama-model-loader.h"
2
-
3
- #include "ggml-alloc.h"
4
- #include "ggml.h"
5
- #include "gguf.h"
6
- #include "llama-hparams.h"
7
-
8
- #include <algorithm>
9
- #include <array>
10
- #include <cinttypes>
11
- #include <cstdint>
12
- #include <cstring>
13
- #include <future>
14
- #include <regex>
15
-
16
- static const size_t kiB = 1024;
17
- static const size_t MiB = 1024*kiB;
18
- static const size_t GiB = 1024*MiB;
19
-
20
- const char * llama_file_version_name(llama_fver version) {
21
- switch (version) {
22
- case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
23
- case GGUF_FILE_VERSION_V2: return "GGUF V2";
24
- case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
25
- }
26
-
27
- return "unknown";
28
- }
29
-
30
- static std::string llama_model_ftype_name(llama_ftype ftype) {
31
- if (ftype & LLAMA_FTYPE_GUESSED) {
32
- return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
33
- }
34
-
35
- switch (ftype) {
36
- case LLAMA_FTYPE_ALL_F32: return "all F32";
37
- case LLAMA_FTYPE_MOSTLY_F16: return "F16";
38
- case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
39
- case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
40
- case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
41
- case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
42
- case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
43
- case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
44
- case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
45
- case LLAMA_FTYPE_MOSTLY_NVFP4: return "NVFP4";
46
- case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
47
- case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
48
- case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
49
- case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
50
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
51
- case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
52
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
53
- case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
54
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
55
- case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
56
- case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
57
- case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
58
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
59
- case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
60
- case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
61
- case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
62
- case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
63
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
64
- case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
65
- case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
66
- case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
67
- case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
68
- case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
69
- case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
70
-
71
- default: return "unknown, may not work";
72
- }
73
- }
74
-
75
- // return a list of splits for a given path
76
- // for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits
77
- static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) {
78
- std::vector<std::string> paths;
79
- std::string split_prefix;
80
- std::vector<char> buf(llama_path_max(), 0);
81
-
82
- {
83
- int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split);
84
- if (!ret) {
85
- throw std::runtime_error(format("invalid split file name: %s", path.c_str()));
86
- }
87
- split_prefix = std::string(buf.data(), ret);
88
- }
89
-
90
- if (split_prefix.empty()) {
91
- throw std::runtime_error(format("invalid split file: %s", path.c_str()));
92
- }
93
-
94
- for (int idx = 0; idx < n_split; ++idx) {
95
- int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split);
96
- paths.push_back(std::string(buf.data(), ret));
97
- }
98
-
99
- return paths;
100
- }
101
-
102
- namespace GGUFMeta {
103
- template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
104
- struct GKV_Base_Type {
105
- static constexpr gguf_type gt = gt_;
106
-
107
- static T getter(const gguf_context * ctx, const int kid) {
108
- return gfun(ctx, kid);
109
- }
110
- };
111
-
112
- template<typename T> struct GKV_Base;
113
-
114
- template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {};
115
- template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {};
116
- template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {};
117
- template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {};
118
- template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {};
119
- template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {};
120
- template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {};
121
- template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {};
122
- template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {};
123
- template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
124
- template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
125
- template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {};
126
-
127
- template<> struct GKV_Base<std::string> {
128
- static constexpr gguf_type gt = GGUF_TYPE_STRING;
129
-
130
- static std::string getter(const gguf_context * ctx, const int kid) {
131
- return gguf_get_val_str(ctx, kid);
132
- }
133
- };
134
-
135
- struct ArrayInfo {
136
- const gguf_type gt;
137
- const size_t length;
138
- const void * data;
139
- };
140
-
141
- template<> struct GKV_Base<ArrayInfo> {
142
- public:
143
- static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
144
- static ArrayInfo getter(const gguf_context *ctx, const int k) {
145
- const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
146
- return ArrayInfo {
147
- arr_type,
148
- size_t(gguf_get_arr_n(ctx, k)),
149
- arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
150
- };
151
- }
152
- };
153
-
154
- template<typename T>
155
- class GKV : public GKV_Base<T> {
156
- GKV() = delete;
157
-
158
- public:
159
- static T get_kv(const gguf_context * ctx, const int k) {
160
- const enum gguf_type kt = gguf_get_kv_type(ctx, k);
161
-
162
- if (kt != GKV::gt) {
163
- throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
164
- gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
165
- }
166
- return GKV::getter(ctx, k);
167
- }
168
-
169
- static const char * override_type_to_str(const llama_model_kv_override_type ty) {
170
- switch (ty) {
171
- case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
172
- case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
173
- case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
174
- case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
175
- }
176
- return "unknown";
177
- }
178
-
179
- static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
180
- if (!ovrd) { return false; }
181
- if (ovrd->tag == expected_type) {
182
- LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
183
- __func__, override_type_to_str(ovrd->tag), ovrd->key);
184
- switch (ovrd->tag) {
185
- case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
186
- LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
187
- } break;
188
- case LLAMA_KV_OVERRIDE_TYPE_INT: {
189
- LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
190
- } break;
191
- case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
192
- LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
193
- } break;
194
- case LLAMA_KV_OVERRIDE_TYPE_STR: {
195
- LLAMA_LOG_INFO("%s\n", ovrd->val_str);
196
- } break;
197
- default:
198
- // Shouldn't be possible to end up here, but just in case...
199
- throw std::runtime_error(
200
- format("Unsupported attempt to override %s type for metadata key %s\n",
201
- override_type_to_str(ovrd->tag), ovrd->key));
202
- }
203
- return true;
204
- }
205
- LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
206
- __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
207
- return false;
208
- }
209
-
210
- template<typename OT>
211
- static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
212
- try_override(OT & target, const struct llama_model_kv_override * ovrd) {
213
- if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
214
- target = ovrd->val_bool;
215
- return true;
216
- }
217
- return false;
218
- }
219
-
220
- template<typename OT>
221
- static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
222
- try_override(OT & target, const struct llama_model_kv_override * ovrd) {
223
- if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
224
- target = ovrd->val_i64;
225
- return true;
226
- }
227
- return false;
228
- }
229
-
230
- template<typename OT>
231
- static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
232
- try_override(T & target, const struct llama_model_kv_override * ovrd) {
233
- if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
234
- target = ovrd->val_f64;
235
- return true;
236
- }
237
- return false;
238
- }
239
-
240
- template<typename OT>
241
- static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
242
- try_override(T & target, const struct llama_model_kv_override * ovrd) {
243
- if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
244
- target = ovrd->val_str;
245
- return true;
246
- }
247
- return false;
248
- }
249
-
250
- static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
251
- if (try_override<T>(target, ovrd)) {
252
- return true;
253
- }
254
- if (k < 0) { return false; }
255
- target = get_kv(ctx, k);
256
- return true;
257
- }
258
-
259
- static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
260
- return set(ctx, gguf_find_key(ctx, key), target, ovrd);
261
- }
262
-
263
- static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
264
- return set(ctx, key.c_str(), target, ovrd);
265
- }
266
- };
267
- }
268
-
269
- template<typename T>
270
- typename std::enable_if<std::is_integral<T>::value, bool>::type
271
- llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
272
- const int kid = gguf_find_key(metadata, key.c_str());
273
-
274
- if (kid < 0) {
275
- if (required) {
276
- throw std::runtime_error(format("key not found in model: %s", key.c_str()));
277
- }
278
- return false;
279
- }
280
-
281
- struct GGUFMeta::ArrayInfo arr_info =
282
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
283
-
284
-
285
- result = arr_info.length;
286
- return true;
287
- }
288
-
289
- template<typename T>
290
- typename std::enable_if<std::is_integral<T>::value, bool>::type
291
- llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) {
292
- return get_arr_n(llm_kv(kid), result, required);
293
- }
294
-
295
- template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required);
296
-
297
- template<typename T>
298
- bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
299
- const gguf_context * ctx = metadata;
300
- const int kid = gguf_find_key(ctx, key.c_str());
301
-
302
- if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
303
- if (required) {
304
- throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
305
- }
306
- return false;
307
- }
308
-
309
- struct GGUFMeta::ArrayInfo arr_info =
310
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
311
-
312
- switch (arr_info.gt) {
313
- case GGUF_TYPE_UINT32:
314
- case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
315
- (std::is_same<T, uint32_t>::value)); break;
316
- case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
317
- case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break;
318
- default:
319
- throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
320
- }
321
-
322
- if constexpr (std::is_same<T, std::string>::value) {
323
- const size_t n_items = gguf_get_arr_n(ctx, kid);
324
- result.clear();
325
-
326
- for (size_t i = 0; i < n_items; i++) {
327
- const T value = gguf_get_arr_str(ctx, kid, i);
328
- result.emplace_back(value);
329
- }
330
- } else {
331
- result.resize(arr_info.length);
332
- result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
333
- }
334
-
335
- return true;
336
- }
337
-
338
- template<typename T, size_t N_MAX>
339
- bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
340
- const gguf_context * ctx = metadata;
341
- const int kid = gguf_find_key(ctx, key.c_str());
342
-
343
- if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
344
- if (required) {
345
- throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
346
- }
347
- return false;
348
- }
349
-
350
- struct GGUFMeta::ArrayInfo arr_info =
351
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
352
-
353
- switch (arr_info.gt) {
354
- case GGUF_TYPE_BOOL:
355
- case GGUF_TYPE_UINT32:
356
- case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
357
- (std::is_same<T, uint32_t>::value)); break;
358
- case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
359
- case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same<T, std::string>::value)); break;
360
- default:
361
- throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
362
- }
363
-
364
- if (arr_info.length > N_MAX) {
365
- throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
366
- }
367
-
368
- if constexpr (std::is_same<T, std::string>::value) {
369
- const size_t n_items = gguf_get_arr_n(ctx, kid);
370
-
371
- for (size_t i = 0; i < n_items; i++) {
372
- const T value = gguf_get_arr_str(ctx, kid, i);
373
- result[i] = value;
374
- }
375
- } else {
376
- if (arr_info.gt == GGUF_TYPE_BOOL) {
377
- std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
378
- return static_cast<T>(x);
379
- });
380
- } else {
381
- std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
382
- }
383
- }
384
-
385
- return true;
386
- }
387
-
388
- template<typename T>
389
- bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) {
390
- return get_arr(llm_kv(kid), result, required);
391
- }
392
-
393
- template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
394
-
395
- template<typename T>
396
- bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
397
- auto it = kv_overrides.find(key);
398
-
399
- const struct llama_model_kv_override * override =
400
- it != kv_overrides.end() ? &it->second : nullptr;
401
-
402
- const bool found = GGUFMeta::GKV<T>::set(metadata, key, result, override);
403
-
404
- if (required && !found) {
405
- throw std::runtime_error(format("key not found in model: %s", key.c_str()));
406
- }
407
-
408
- return found;
409
- }
410
-
411
- template<typename T>
412
- bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) {
413
- return get_key(llm_kv(kid), result, required);
414
- }
415
-
416
- template bool llama_model_loader::get_key<bool> (enum llm_kv kid, bool & result, bool required);
417
- template bool llama_model_loader::get_key<float> (enum llm_kv kid, float & result, bool required);
418
- template bool llama_model_loader::get_key<uint32_t> (enum llm_kv kid, uint32_t & result, bool required);
419
- template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);
420
-
421
- template<>
422
- bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) {
423
- uint32_t tmp;
424
- const bool found = get_key(kid, tmp, required);
425
- if (found) {
426
- result = (enum llama_pooling_type) tmp;
427
- } else {
428
- result = LLAMA_POOLING_TYPE_UNSPECIFIED;
429
- }
430
- return found;
431
- }
432
-
433
- // get array of n <= N_MAX elements, or a single element repeated n times
434
- template<typename T, size_t N_MAX>
435
- bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
436
- const int kid = gguf_find_key(metadata, key.c_str());
437
-
438
- if (kid < 0) {
439
- if (required) {
440
- throw std::runtime_error(format("key not found in model: %s", key.c_str()));
441
- }
442
- return false;
443
- }
444
-
445
- if (n > N_MAX) {
446
- throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
447
- }
448
-
449
- if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {
450
- struct GGUFMeta::ArrayInfo arr_info =
451
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
452
-
453
- if (n != arr_info.length) {
454
- throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
455
- }
456
-
457
- return get_arr(key, result, required);
458
- }
459
-
460
- T value;
461
-
462
- bool ok = get_key(key, value, required);
463
- if (!ok) {
464
- return false;
465
- }
466
-
467
- for (uint32_t i = 0; i < n; i++) {
468
- result[i] = value;
469
- }
470
-
471
- return true;
472
- }
473
-
474
- template<typename T>
475
- bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) {
476
- return get_key_or_arr(llm_kv(kid), result, n, required);
477
- }
478
-
479
- bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
480
- const std::string key = llm_kv(kid);
481
-
482
- const int id = gguf_find_key(metadata, key.c_str());
483
-
484
- if (id < 0) {
485
- if (required) {
486
- throw std::runtime_error(format("key not found in model: %s", key.c_str()));
487
- }
488
- return false;
489
- }
490
-
491
- // throw and error if type is an array
492
- if (gguf_get_kv_type(metadata, id) == GGUF_TYPE_ARRAY) {
493
- if (required) {
494
- throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
495
- }
496
- return false;
497
- }
498
-
499
- return get_key(key, result, required);
500
- }
501
-
502
- // TODO: this is not very clever - figure out something better
503
- template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
504
- template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
505
- template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
506
-
507
-
508
- llama_model_loader::llama_model_loader(
509
- struct gguf_context * meta,
510
- llama_model_set_tensor_data_t set_tensor_data,
511
- void * set_tensor_data_ud,
512
- const std::string & fname,
513
- std::vector<std::string> & splits,
514
- bool use_mmap,
515
- bool use_direct_io,
516
- bool check_tensors,
517
- bool no_alloc,
518
- const llama_model_kv_override * param_overrides_p,
519
- const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
520
- : metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
521
- int trace = 0;
522
- if (getenv("LLAMA_TRACE")) {
523
- trace = atoi(getenv("LLAMA_TRACE"));
524
- }
525
-
526
- if (param_overrides_p != nullptr) {
527
- for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
528
- kv_overrides.insert({std::string(p->key), *p});
529
- }
530
- }
531
-
532
- tensor_buft_overrides = param_tensor_buft_overrides_p;
533
-
534
- if (!fname.empty()) {
535
- // Load the main GGUF
536
- struct ggml_context * ctx = NULL;
537
- struct gguf_init_params params = {
538
- /*.no_alloc = */ true,
539
- /*.ctx = */ &ctx,
540
- };
541
-
542
- metadata_ptr.reset(gguf_init_from_file(fname.c_str(), params));
543
- metadata = metadata_ptr.get();
544
- if (metadata == nullptr) {
545
- throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
546
- }
547
-
548
- get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
549
- llm_kv = LLM_KV(llm_arch_from_string(arch_name));
550
-
551
- files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
552
- contexts.emplace_back(ctx);
553
-
554
- if (use_mmap && use_direct_io) {
555
- if (files.back()->has_direct_io()) {
556
- LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
557
- use_mmap = false;
558
- } else {
559
- LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
560
- use_direct_io = false;
561
-
562
- // reopen file using std::fopen for mmap
563
- files.pop_back();
564
- files.emplace_back(new llama_file(fname.c_str(), "rb", false));
565
- }
566
- }
567
-
568
- // Save tensors data offset of the main file.
569
- // For subsidiary files, `meta` tensor data offset must not be used,
570
- // so we build a unified tensors index for weights.
571
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
572
- std::string tensor_name = std::string(cur->name);
573
- // make sure there is no duplicated tensor names
574
- if (weights_map.find(tensor_name) != weights_map.end()) {
575
- throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
576
- }
577
- n_elements += ggml_nelements(cur);
578
- n_bytes += ggml_nbytes(cur);
579
- weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
580
- }
581
- uint16_t n_split = 0;
582
- get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
583
-
584
- // Load additional GGML contexts
585
- if (n_split > 1) {
586
- // make sure the main file is loaded first
587
- uint16_t idx = 0;
588
- const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
589
- get_key(kv_split_no, idx);
590
- if (idx != 0) {
591
- throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
592
- }
593
-
594
- // generate list of splits if needed
595
- if (splits.empty()) {
596
- splits = llama_get_list_splits(fname, idx, n_split);
597
- }
598
-
599
- // in case user give a custom list of splits, check if it matches the expected number
600
- if (n_split != (uint16_t)splits.size()) {
601
- throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
602
- }
603
-
604
- if (trace > 0) {
605
- LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
606
- }
607
-
608
- // load other splits
609
- for (idx = 1; idx < n_split; idx++) {
610
- const char * fname_split = splits[idx].c_str();
611
-
612
- struct gguf_init_params split_params = {
613
- /*.no_alloc = */ true,
614
- /*.ctx = */ &ctx,
615
- };
616
- gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
617
- if (!ctx_gguf) {
618
- throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
619
- }
620
-
621
- // check idx
622
- {
623
- const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
624
- if (kid < 0) {
625
- throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
626
- }
627
- int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
628
- if (idx_gguf != idx) {
629
- throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
630
- }
631
- }
632
-
633
- files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
634
- contexts.emplace_back(ctx);
635
-
636
- // Save tensors data offset info of the shard.
637
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
638
- std::string tensor_name = std::string(cur->name);
639
- // make sure there is no duplicated tensor names
640
- if (weights_map.find(tensor_name) != weights_map.end()) {
641
- throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
642
- }
643
- n_elements += ggml_nelements(cur);
644
- n_bytes += ggml_nbytes(cur);
645
- weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
646
- }
647
- }
648
-
649
- get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
650
-
651
- // sanity check
652
- {
653
- const int n_tensors_loaded = (int) weights_map.size();
654
- if (n_tensors != n_tensors_loaded) {
655
- throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
656
- }
657
- }
658
-
659
- LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
660
- }
661
- } else {
662
- get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
663
- llm_kv = LLM_KV(llm_arch_from_string(arch_name));
664
- }
665
-
666
- n_kv = gguf_get_n_kv(metadata);
667
- n_tensors = weights_map.size();
668
-
669
- fver = (enum llama_fver) gguf_get_version(metadata);
670
-
671
- LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
672
- __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
673
-
674
- // determine file type based on the number of tensors for each quantization and print meta data
675
- // TODO: make optional
676
- {
677
- std::map<enum ggml_type, uint32_t> n_type;
678
-
679
- uint32_t n_type_max = 0;
680
- enum ggml_type type_max = GGML_TYPE_F32;
681
-
682
- for (const auto & it : weights_map) {
683
- const llama_tensor_weight & w = it.second;
684
- const ggml_tensor * tensor = w.tensor;
685
-
686
- enum ggml_type type = tensor->type;
687
-
688
- n_type[type]++;
689
-
690
- if (n_type_max < n_type[type]) {
691
- n_type_max = n_type[type];
692
- type_max = type;
693
- }
694
-
695
- if (trace > 0) {
696
- const uint16_t sid = w.idx;
697
- LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__,
698
- sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(),
699
- ggml_nbytes(tensor)/1024.0f/1024.0f);
700
- }
701
- }
702
-
703
- switch (type_max) {
704
- case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
705
- case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
706
- case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
707
- case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
708
- case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
709
- case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
710
- case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
711
- case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
712
- case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
713
- case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
714
- case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
715
- case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
716
- case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
717
- case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break;
718
- case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break;
719
- case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
720
- case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
721
- case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
722
- case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
723
- case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
724
- case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
725
- case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
726
- case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
727
- case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
728
- case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
729
- default:
730
- {
731
- LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
732
- ftype = LLAMA_FTYPE_ALL_F32;
733
- } break;
734
- }
735
-
736
- // this is a way to mark that we have "guessed" the file type
737
- ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
738
-
739
- {
740
- uint32_t ftype_val = 0;
741
- if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) {
742
- ftype = (llama_ftype) ftype_val;
743
- }
744
- }
745
-
746
- LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
747
-
748
- for (int i = 0; i < n_kv; i++) {
749
- const char * name = gguf_get_key(metadata, i);
750
- const enum gguf_type type = gguf_get_kv_type(metadata, i);
751
- const std::string type_name =
752
- type == GGUF_TYPE_ARRAY
753
- ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(metadata, i)), gguf_get_arr_n(metadata, i))
754
- : gguf_type_name(type);
755
-
756
- std::string value = gguf_kv_to_str(metadata, i);
757
- const size_t MAX_VALUE_LEN = 40;
758
- if (value.size() > MAX_VALUE_LEN) {
759
- value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
760
- }
761
- replace_all(value, "\n", "\\n");
762
-
763
- LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
764
- }
765
-
766
- // print type counts
767
- for (auto & kv : n_type) {
768
- if (kv.second == 0) {
769
- continue;
770
- }
771
-
772
- LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
773
- }
774
- }
775
-
776
- if (!llama_mmap::SUPPORTED) {
777
- LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
778
- use_mmap = false;
779
- }
780
-
781
- this->use_mmap = use_mmap;
782
- this->use_direct_io = use_direct_io;
783
- this->check_tensors = check_tensors;
784
- this->no_alloc = no_alloc;
785
- }
786
-
787
- std::string llama_model_loader::get_arch_name() const {
788
- return arch_name;
789
- }
790
-
791
- enum llm_arch llama_model_loader::get_arch() const {
792
- return llm_kv.arch;
793
- }
794
-
795
- const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const {
796
- auto pos = weights_map.find(name);
797
- if (pos != weights_map.end()) {
798
- return &pos->second;
799
- }
800
-
801
- return nullptr;
802
- }
803
-
804
- const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const {
805
- const llama_tensor_weight * weight = get_weight(name);
806
- if (!weight) {
807
- throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
808
- }
809
- return *weight;
810
- }
811
-
812
- struct ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const {
813
- const auto * weight = get_weight(name);
814
- if (!weight) {
815
- return nullptr;
816
- }
817
- return weight->tensor;
818
- }
819
-
820
- struct ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const {
821
- struct ggml_tensor * tensor = get_tensor_meta(name.c_str());
822
- if (!tensor) {
823
- throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
824
- }
825
- return tensor;
826
- }
827
-
828
- const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
829
- const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
830
-
831
- if (cur == NULL) {
832
- if (!required) {
833
- return NULL;
834
- }
835
- throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
836
- }
837
-
838
- {
839
- bool is_ok = true;
840
- for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
841
- if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
842
- is_ok = false;
843
- break;
844
- }
845
- }
846
- if (!is_ok) {
847
- throw std::runtime_error(
848
- format("%s: tensor '%s' has wrong shape; expected %s, got %s",
849
- __func__, name.c_str(),
850
- llama_format_tensor_shape(ne).c_str(),
851
- llama_format_tensor_shape(cur).c_str()));
852
- }
853
- }
854
-
855
- return cur;
856
- }
857
-
858
- // checks if the weight tensor can be used with the specified buffer type and device
859
- static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
860
- GGML_ASSERT(w != nullptr);
861
-
862
- if (op == GGML_OP_NONE) {
863
- return true;
864
- }
865
-
866
- ggml_init_params params = {
867
- /*.mem_size =*/ ggml_tensor_overhead()*8,
868
- /*.mem_buffer =*/ NULL,
869
- /*.no_alloc =*/ true,
870
- };
871
- ggml_context_ptr ctx_ptr { ggml_init(params) };
872
- if (!ctx_ptr) {
873
- throw std::runtime_error(format("failed to create ggml context"));
874
- }
875
- ggml_context * ctx = ctx_ptr.get();
876
-
877
- ggml_tensor * op_tensor = nullptr;
878
-
879
- switch (op) {
880
- case GGML_OP_GET_ROWS:
881
- {
882
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
883
- op_tensor = ggml_get_rows(ctx, w, b);
884
- } break;
885
- case GGML_OP_MUL_MAT:
886
- {
887
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
888
- op_tensor = ggml_mul_mat(ctx, w, b);
889
- } break;
890
- case GGML_OP_MUL_MAT_ID:
891
- {
892
- const int n_expert_used = hparams.n_expert_used;
893
- GGML_ASSERT(n_expert_used > 0);
894
- ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
895
- ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
896
- op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
897
- } break;
898
- case GGML_OP_ADD:
899
- {
900
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
901
- op_tensor = ggml_add(ctx, a, w);
902
- } break;
903
- case GGML_OP_ADD_ID:
904
- {
905
- const int n_expert_used = hparams.n_expert_used;
906
- GGML_ASSERT(n_expert_used > 0);
907
- ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
908
- ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
909
- op_tensor = ggml_add_id(ctx, a, w, c);
910
- } break;
911
- case GGML_OP_MUL:
912
- {
913
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
914
- op_tensor = ggml_mul(ctx, a, w);
915
- } break;
916
- case GGML_OP_DIV:
917
- {
918
- ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
919
- op_tensor = ggml_div(ctx, a, w);
920
- } break;
921
- case GGML_OP_ROPE:
922
- {
923
- const int n_embd_head = hparams.n_embd_head_v();
924
- const int n_head = hparams.n_head();
925
- ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
926
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
927
- op_tensor = ggml_rope_ext(
928
- ctx, a, b, w,
929
- 0, 0, 0, 0, 0,
930
- 0, 0, 0, 0
931
- );
932
-
933
- } break;
934
- case GGML_OP_SSM_CONV:
935
- {
936
- const int64_t n_seq_tokens = 512;
937
- const int64_t n_seqs = 3;
938
- ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
939
- op_tensor = ggml_ssm_conv(ctx, conv_x, w);
940
- } break;
941
- case GGML_OP_SSM_SCAN:
942
- {
943
- // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
944
- const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
945
- const int64_t n_head = w->ne[1];
946
- const int64_t head_dim = hparams.ssm_d_inner / n_head;
947
- const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
948
- const int64_t n_seq_tokens = 512;
949
- const int64_t n_seqs = 3;
950
- ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
951
- ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
952
- ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
953
- ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
954
- ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
955
- ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
956
- op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
957
- } break;
958
- case GGML_OP_RWKV_WKV6:
959
- {
960
- // FIXME
961
- const int64_t S = 123;
962
- const int64_t H = 123;
963
- const int64_t n_tokens = 123;
964
- const int64_t n_seqs = 123;
965
- ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
966
- ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
967
- ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
968
- ggml_tensor * tf = w;
969
- ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
970
- ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
971
- op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
972
- } break;
973
- case GGML_OP_IM2COL:
974
- {
975
- const int n_embd_inp = hparams.n_embd_inp();
976
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
977
- op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
978
- } break;
979
- case GGML_OP_SCALE:
980
- {
981
- op_tensor = ggml_scale(ctx, w, 1.0f);
982
- } break;
983
- default:
984
- GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
985
- }
986
-
987
- // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
988
- GGML_ASSERT(w->buffer == nullptr);
989
- w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
990
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
991
- ggml_backend_buffer_free(w->buffer);
992
- w->buffer = nullptr;
993
-
994
- return op_supported;
995
- }
996
-
997
- // find the first buffer type in the list that can use the tensor
998
- static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t * buft_list) {
999
- GGML_ASSERT(!buft_list->empty());
1000
- for (const auto & cur : *buft_list) {
1001
- ggml_backend_dev_t cur_dev = cur.first;
1002
- ggml_backend_buffer_type_t cur_buft = cur.second;
1003
- if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
1004
- return cur_buft;
1005
- }
1006
- }
1007
-
1008
- return nullptr;
1009
- }
1010
-
1011
- struct ggml_tensor * llama_model_loader::create_tensor(
1012
- const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
1013
- const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) {
1014
- auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
1015
- auto it = ctx_map.find(buft);
1016
- if (it == ctx_map.end()) {
1017
- // one ggml context per buffer type
1018
- int max_n_tensors = n_tensors;
1019
- max_n_tensors += 1; // duplicated output tensor
1020
- max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors
1021
- if (files.empty()) {
1022
- max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses
1023
- }
1024
- const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
1025
-
1026
- ggml_init_params params = {
1027
- /*.mem_size =*/ ctx_size,
1028
- /*.mem_buffer =*/ NULL,
1029
- /*.no_alloc =*/ true,
1030
- };
1031
-
1032
- ggml_context * ctx = ggml_init(params);
1033
- if (!ctx) {
1034
- throw std::runtime_error(format("failed to create ggml context"));
1035
- }
1036
-
1037
- ctx_map.emplace(buft, ctx);
1038
-
1039
- return ctx;
1040
- }
1041
- return it->second.get();
1042
- };
1043
-
1044
- auto buft_for_tensor = [&](ggml_tensor * t_meta) -> ggml_backend_buffer_type_t {
1045
- if (!t_meta) {
1046
- if (flags & TENSOR_NOT_REQUIRED) {
1047
- return nullptr;
1048
- }
1049
- throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
1050
- }
1051
-
1052
- // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
1053
- // the tensor is duplicated
1054
- // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
1055
- llm_tensor tn_tensor = tn.tensor;
1056
- if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && (flags & TENSOR_DUPLICATED)) {
1057
- tn_tensor = LLM_TENSOR_OUTPUT;
1058
- }
1059
-
1060
- llm_tensor_info info;
1061
- try {
1062
- info = llm_tensor_info_for(tn_tensor);
1063
- } catch (const std::out_of_range & e) {
1064
- throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
1065
- }
1066
-
1067
- // skip unused tensors
1068
- if (info.op == GGML_OP_NONE || (flags & TENSOR_SKIP)) {
1069
- const size_t nbytes = ggml_nbytes(t_meta);
1070
- LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
1071
-
1072
- size_data -= nbytes;
1073
- n_created++;
1074
-
1075
- return nullptr;
1076
- }
1077
-
1078
- // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
1079
- ggml_op op;
1080
- bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
1081
- if (bias) {
1082
- if (info.op == GGML_OP_MUL_MAT_ID) {
1083
- op = GGML_OP_ADD_ID;
1084
- } else {
1085
- op = GGML_OP_ADD;
1086
- }
1087
- } else {
1088
- op = info.op;
1089
- }
1090
-
1091
- // sanity checks
1092
- if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
1093
- if (tn.bid != -1) {
1094
- GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
1095
- }
1096
- } else {
1097
- if (tn.bid == -1) {
1098
- GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
1099
- }
1100
- }
1101
-
1102
- // select the buffer type for this tensor
1103
- const buft_list_t * buft_list;
1104
- switch (info.layer) {
1105
- case LLM_TENSOR_LAYER_INPUT:
1106
- buft_list = buft_list_input;
1107
- break;
1108
- case LLM_TENSOR_LAYER_OUTPUT:
1109
- buft_list = buft_list_output;
1110
- break;
1111
- case LLM_TENSOR_LAYER_REPEATING:
1112
- GGML_ASSERT(buft_list_layer != nullptr);
1113
- buft_list = buft_list_layer;
1114
- break;
1115
- default:
1116
- GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
1117
- }
1118
-
1119
- ggml_backend_buffer_type_t buft = nullptr;
1120
-
1121
- // check overrides
1122
- if (tensor_buft_overrides) {
1123
- std::string tensor_name = tn.str();
1124
- for (const auto * overrides = tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
1125
- std::regex pattern(overrides->pattern);
1126
- if (std::regex_search(tensor_name, pattern)) {
1127
- if (overrides->buft == ggml_backend_cpu_buffer_type()) {
1128
- // when overriding to a CPU buffer, consider the extra buffer types
1129
- buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
1130
- } else {
1131
- buft = overrides->buft;
1132
- }
1133
-
1134
- LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
1135
- tensor_name.c_str(),
1136
- ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
1137
- ggml_backend_buft_name(buft));
1138
- break;
1139
- }
1140
- }
1141
- }
1142
-
1143
- if (!buft) {
1144
- buft = select_weight_buft(hparams, t_meta, op, buft_list);
1145
- if (!buft) {
1146
- throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
1147
- }
1148
- }
1149
-
1150
- // avoid using a host buffer when using mmap
1151
- auto * buft_dev = ggml_backend_buft_get_device(buft);
1152
- if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
1153
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
1154
- if (!cpu_dev) {
1155
- throw std::runtime_error("no CPU backend found");
1156
- }
1157
- buft = ggml_backend_dev_buffer_type(cpu_dev);
1158
- }
1159
-
1160
- if (buft != buft_list->front().second) {
1161
- if (n_tensors_moved == 0) {
1162
- first_tensor_moved_name = t_meta->name;
1163
- first_tensor_moved_type_name = ggml_type_name(t_meta->type);
1164
- first_moved_from_buft = buft_list->front().second;
1165
- first_moved_to_buft = buft;
1166
- }
1167
- n_tensors_moved++;
1168
- }
1169
-
1170
- return buft;
1171
- };
1172
-
1173
- if (files.empty()) {
1174
- if (flags & TENSOR_SKIP_IF_VIRTUAL) {
1175
- return nullptr;
1176
- }
1177
- ggml_type type = GGML_TYPE_F32;
1178
- const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str());
1179
- if (tid != -1) {
1180
- type = gguf_get_tensor_type(metadata, tid);
1181
- }
1182
-
1183
- // for tensors that are not required some of the dimensions can be invalid:
1184
- if (flags & TENSOR_NOT_REQUIRED) {
1185
- for (size_t dim = 0; dim < ne.size(); dim++) {
1186
- if (ne.begin()[dim] <= 0) {
1187
- return nullptr;
1188
- }
1189
- }
1190
- }
1191
-
1192
- ggml_tensor t_meta;
1193
- memset(&t_meta, 0, sizeof(ggml_tensor));
1194
- t_meta.type = type;
1195
- for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) {
1196
- t_meta.ne[dim] = dim < ne.size() ? ne.begin()[dim] : 1;
1197
- GGML_ASSERT(t_meta.ne[dim] >= 1);
1198
- t_meta.nb[dim] = dim == 0 ? ggml_type_size(type) : t_meta.ne[dim-1]*t_meta.nb[dim-1];
1199
- GGML_ASSERT(t_meta.nb[dim] >= 1);
1200
- }
1201
- ggml_set_name(&t_meta, tn.str().c_str());
1202
-
1203
- ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta);
1204
- GGML_ASSERT(buft != nullptr);
1205
- ggml_context * ctx = ctx_for_buft(buft);
1206
- ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta);
1207
- ggml_set_name(ret, tn.str().c_str());
1208
- return ret;
1209
- }
1210
-
1211
- ggml_tensor * t_meta = get_tensor_meta(tn.str().c_str());
1212
- ggml_backend_buffer_type_t buft = buft_for_tensor(t_meta);
1213
- if (buft == nullptr) {
1214
- return nullptr; // return type is ggml_tensor *
1215
- }
1216
- ggml_context * ctx = ctx_for_buft(buft);
1217
-
1218
- // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
1219
- if (flags & TENSOR_DUPLICATED) {
1220
- ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
1221
- if (t) {
1222
- return t;
1223
- }
1224
- }
1225
-
1226
- LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, tn.str().c_str());
1227
- const struct ggml_tensor * cur = check_tensor_dims(tn.str(), ne, !(flags & TENSOR_NOT_REQUIRED));
1228
-
1229
- if (cur == NULL) {
1230
- return NULL;
1231
- }
1232
-
1233
- const bool duplicated = flags & TENSOR_DUPLICATED;
1234
-
1235
- struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
1236
- ggml_set_name(tensor, ggml_get_name(cur));
1237
-
1238
- if (duplicated) {
1239
- size_data += ggml_nbytes(cur);
1240
- } else {
1241
- n_created++;
1242
- }
1243
-
1244
- return tensor;
1245
- }
1246
-
1247
- struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
1248
- const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
1249
-
1250
- if (cur == NULL) {
1251
- return NULL;
1252
- }
1253
-
1254
- if (cur->type != base->type) {
1255
- throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
1256
- }
1257
-
1258
- std::array<int64_t, GGML_MAX_DIMS> dims;
1259
- for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
1260
- dims[i] = i < ne.size() ? ne.begin()[i] : 1;
1261
- }
1262
-
1263
- struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
1264
- dims[0], dims[1], dims[2], dims[3],
1265
- cur->nb[1], cur->nb[2], cur->nb[3],
1266
- offset);
1267
-
1268
- ggml_set_name(tensor, name.c_str());
1269
-
1270
- n_created++;
1271
-
1272
- return tensor;
1273
- }
1274
-
1275
- void llama_model_loader::done_getting_tensors() const {
1276
- if (n_created != n_tensors) {
1277
- throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
1278
- }
1279
- if (n_tensors_moved > 0) {
1280
- LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",
1281
- __func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1,
1282
- ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
1283
- }
1284
- }
1285
-
1286
- void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
1287
- if (use_mmap) {
1288
- mappings.reserve(files.size());
1289
- mmaps_used.reserve(files.size());
1290
- for (const auto & file : files) {
1291
- bool is_numa = false;
1292
-
1293
- auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
1294
- if (dev) {
1295
- auto * reg = ggml_backend_dev_backend_reg(dev);
1296
- auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
1297
- if (is_numa_fn) {
1298
- is_numa = is_numa_fn();
1299
- }
1300
- }
1301
-
1302
- std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa);
1303
- mmaps_used.emplace_back(mapping->size(), 0);
1304
- if (mlock_mmaps) {
1305
- std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
1306
- mlock_mmap->init(mapping->addr());
1307
- mlock_mmaps->emplace_back(std::move(mlock_mmap));
1308
- }
1309
- mappings.emplace_back(std::move(mapping));
1310
- }
1311
- }
1312
-
1313
- // compute the total size of all tensors for progress reporting
1314
- for (const auto & it : weights_map) {
1315
- size_data += ggml_nbytes(it.second.tensor);
1316
- }
1317
- }
1318
-
1319
- void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
1320
- GGML_ASSERT(!mappings.empty());
1321
- const auto & mapping = mappings.at(idx);
1322
-
1323
- *first = mapping->size();
1324
- *last = 0;
1325
- *addr = mapping->addr();
1326
- for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
1327
- const auto * weight = get_weight(ggml_get_name(tensor));
1328
- if (!weight || weight->idx != idx) {
1329
- continue;
1330
- }
1331
- *first = std::min(*first, weight->offs);
1332
- *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
1333
- }
1334
- }
1335
-
1336
- void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
1337
- const auto & w = require_weight(ggml_get_name(cur));
1338
-
1339
- if (use_mmap) {
1340
- const auto & mapping = mappings.at(w.idx);
1341
- if (cur->data == nullptr) {
1342
- cur->data = (uint8_t *)mapping->addr() + w.offs;
1343
- } else {
1344
- memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur));
1345
- }
1346
- } else {
1347
- GGML_ASSERT(cur->data != nullptr);
1348
- GGML_ASSERT(w.idx < files.size());
1349
- const auto & file = files.at(w.idx);
1350
- file->seek(w.offs, SEEK_SET);
1351
- file->read_raw(cur->data, ggml_nbytes(cur));
1352
- }
1353
-
1354
- if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
1355
- throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
1356
- }
1357
- }
1358
-
1359
- bool llama_model_loader::load_all_data(
1360
- struct ggml_context * ctx,
1361
- llama_buf_map & bufs,
1362
- llama_mlocks * lmlocks,
1363
- llama_progress_callback progress_callback,
1364
- void * progress_callback_user_data) {
1365
- if (files.empty()) {
1366
- for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
1367
- set_tensor_data(t, set_tensor_data_ud);
1368
- }
1369
- return true;
1370
- }
1371
- GGML_ASSERT(size_data != 0 && "call init_mappings() first");
1372
-
1373
- std::vector<no_init<uint8_t>> read_buf;
1374
- std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
1375
-
1376
- // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
1377
- // NVMe raid configurations might require more / larger buffers.
1378
- constexpr size_t n_buffers = 4;
1379
-
1380
- size_t alignment = 1;
1381
- for (const auto & file : files) {
1382
- alignment = std::max(file->read_alignment(), alignment);
1383
- }
1384
-
1385
- // Buffer size: balance between memory usage and I/O efficiency
1386
- // 64MB works well for NVMe drives
1387
- const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
1388
-
1389
- std::vector<ggml_backend_buffer_t> host_buffers;
1390
- std::vector<ggml_backend_event_t> events;
1391
- std::vector<void *> host_ptrs;
1392
- size_t buffer_idx = 0; // buffer to use for async loads
1393
- ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t {
1394
- if (use_mmap || check_tensors) {
1395
- return nullptr;
1396
- }
1397
- // When not using mmaped io use async uploads from pinned memory to GPU memory.
1398
- // First determine if the backend supports the necessary features for async uploads.
1399
- auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
1400
- if (!buf) {
1401
- LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func);
1402
- return nullptr;
1403
- }
1404
-
1405
- auto * buft = ggml_backend_buffer_get_type(buf);
1406
- auto * dev = ggml_backend_buft_get_device(buft);
1407
- if (!dev) {
1408
- LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func,
1409
- ggml_backend_buft_name(buft));
1410
- return nullptr;
1411
- }
1412
-
1413
- if (buft != ggml_backend_dev_buffer_type(dev)) {
1414
- LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func,
1415
- ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
1416
- return nullptr;
1417
- }
1418
-
1419
- ggml_backend_dev_props props;
1420
- ggml_backend_dev_get_props(dev, &props);
1421
- if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
1422
- LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func,
1423
- ggml_backend_dev_name(dev));
1424
- return nullptr;
1425
- }
1426
-
1427
- auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
1428
- if (!host_buft) {
1429
- LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func,
1430
- ggml_backend_dev_name(dev));
1431
- return nullptr;
1432
- }
1433
-
1434
- // If the backend is supported, create pinned memory buffers and events for synchronisation.
1435
- for (size_t idx = 0; idx < n_buffers; ++idx) {
1436
- auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
1437
-
1438
- if (!buf) {
1439
- LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
1440
- ggml_backend_dev_name(dev));
1441
- return nullptr;
1442
- }
1443
-
1444
- host_buffers.emplace_back(buf);
1445
- host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
1446
-
1447
- auto * event = ggml_backend_event_new(dev);
1448
- if (!event) {
1449
- LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func,
1450
- ggml_backend_dev_name(dev));
1451
- return nullptr;
1452
- }
1453
-
1454
- events.emplace_back(event);
1455
- }
1456
-
1457
- ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
1458
- if (!backend) {
1459
- LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func,
1460
- ggml_backend_dev_name(dev));
1461
- return nullptr;
1462
- }
1463
-
1464
- return backend;
1465
- }(__func__);
1466
-
1467
- if (upload_backend) {
1468
- LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
1469
- ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
1470
- ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
1471
- ggml_backend_name(upload_backend));
1472
- }
1473
-
1474
- for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
1475
- const auto * weight = get_weight(ggml_get_name(cur));
1476
- if (weight == nullptr) {
1477
- // this can happen with split experts models
1478
- continue;
1479
- }
1480
-
1481
- if (progress_callback) {
1482
- if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
1483
- return false;
1484
- }
1485
- }
1486
-
1487
- size_t n_size = ggml_nbytes(cur);
1488
-
1489
- if (use_mmap) {
1490
- const auto & mapping = mappings.at(weight->idx);
1491
- ggml_backend_buffer_t buf_mmap = nullptr;
1492
- if (bufs.count(weight->idx)) {
1493
- buf_mmap = bufs.at(weight->idx);
1494
- }
1495
- uint8_t * data = (uint8_t *) mapping->addr() + weight->offs;
1496
-
1497
- if (check_tensors) {
1498
- validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
1499
- return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
1500
- }));
1501
- }
1502
-
1503
- GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
1504
- if (buf_mmap && cur->data == nullptr) {
1505
- ggml_backend_tensor_alloc(buf_mmap, cur, data);
1506
- if (lmlocks) {
1507
- const auto & lmlock = lmlocks->at(weight->idx);
1508
- lmlock->grow_to(weight->offs + n_size);
1509
- }
1510
-
1511
- auto & mmap_used = mmaps_used[weight->idx];
1512
- mmap_used.first = std::min(mmap_used.first, weight->offs);
1513
- mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
1514
- } else {
1515
- ggml_backend_tensor_set(cur, data, 0, n_size);
1516
- }
1517
- } else {
1518
- const auto & file = files.at(weight->idx);
1519
-
1520
- if (ggml_backend_buffer_is_host(cur->buffer)) {
1521
- file->seek(weight->offs, SEEK_SET);
1522
- file->read_raw(cur->data, n_size);
1523
- if (check_tensors) {
1524
- validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
1525
- return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
1526
- }));
1527
- }
1528
- } else {
1529
- // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
1530
- if (upload_backend) {
1531
- size_t offset = weight->offs;
1532
- alignment = file->read_alignment();
1533
- size_t aligned_offset = offset & ~(alignment - 1);
1534
- size_t offset_from_alignment = offset - aligned_offset;
1535
- file->seek(aligned_offset, SEEK_SET);
1536
-
1537
- // Calculate aligned read boundaries
1538
- size_t read_start = aligned_offset;
1539
- size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
1540
-
1541
- size_t bytes_read = 0;
1542
- size_t data_read = 0; // Actual tensor data copied (excluding padding)
1543
-
1544
- while (bytes_read < read_end - read_start) {
1545
- size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
1546
-
1547
- // Align the destination pointer within the pinned buffer
1548
- uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
1549
-
1550
- // Wait for previous upload to complete before reusing buffer
1551
- ggml_backend_event_synchronize(events[buffer_idx]);
1552
-
1553
- // Read aligned chunk from file
1554
- file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
1555
-
1556
- // Calculate actual data portion (excluding alignment padding)
1557
- uintptr_t ptr_data = ptr_dest_aligned;
1558
- size_t data_to_copy = read_size;
1559
-
1560
- // Skip alignment padding at start of first chunk
1561
- if (bytes_read == 0) {
1562
- ptr_data += offset_from_alignment;
1563
- data_to_copy -= offset_from_alignment;
1564
- }
1565
-
1566
- // Trim alignment padding at end of last chunk
1567
- if (aligned_offset + bytes_read + read_size > offset + n_size) {
1568
- data_to_copy -= (read_end - (offset + n_size));
1569
- }
1570
-
1571
- // Async upload actual data to GPU
1572
- ggml_backend_tensor_set_async(upload_backend, cur,
1573
- reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
1574
- ggml_backend_event_record(events[buffer_idx], upload_backend);
1575
-
1576
- data_read += data_to_copy;
1577
- bytes_read += read_size;
1578
-
1579
- ++buffer_idx;
1580
- buffer_idx %= n_buffers;
1581
- }
1582
- } else {
1583
- read_buf.resize(n_size);
1584
- file->seek(weight->offs, SEEK_SET);
1585
- file->read_raw(read_buf.data(), n_size);
1586
- ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
1587
- if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
1588
- throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
1589
- }
1590
- }
1591
- }
1592
- }
1593
-
1594
- size_done += n_size;
1595
- }
1596
-
1597
- // free temporary resources used for async uploads
1598
- for (auto * event : events) {
1599
- ggml_backend_event_synchronize(event);
1600
- ggml_backend_event_free(event);
1601
- }
1602
- for (auto * buf : host_buffers) {
1603
- ggml_backend_buffer_free(buf);
1604
- }
1605
- ggml_backend_free(upload_backend);
1606
-
1607
- // check validation results
1608
- bool validation_failed = false;
1609
- for (auto & future : validation_result) {
1610
- auto result = future.get();
1611
- if (!result.second) {
1612
- LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
1613
- validation_failed = true;
1614
- }
1615
- }
1616
- if (validation_failed) {
1617
- throw std::runtime_error("found tensors with invalid data");
1618
- }
1619
-
1620
- // check if this is the last call and do final cleanup
1621
- if (size_done >= size_data) {
1622
- // unmap offloaded tensors and metadata
1623
- if (use_mmap) {
1624
- for (uint32_t idx = 0; idx < mappings.size(); idx++) {
1625
- const auto & mmap_used = mmaps_used.at(idx);
1626
- auto & mapping = mappings.at(idx);
1627
- mapping->unmap_fragment(0, mmap_used.first);
1628
- if (mmap_used.second != 0) {
1629
- mapping->unmap_fragment(mmap_used.second, mapping->size());
1630
- }
1631
- }
1632
- }
1633
- if (progress_callback) {
1634
- // Even though the model is done loading, we still honor
1635
- // cancellation since we need to free allocations.
1636
- return progress_callback(1.0f, progress_callback_user_data);
1637
- }
1638
- }
1639
-
1640
- return true;
1641
- }
1642
-
1643
- std::string llama_model_loader::ftype_name() const {
1644
- return llama_model_ftype_name(ftype);
1645
- }
1646
-
1647
- void llama_model_loader::print_info() const {
1648
- LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
1649
- LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
1650
- if (n_bytes < GiB) {
1651
- LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
1652
- } else {
1653
- LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
1654
- }
1655
- }