whispercpp 1.3.6 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (828)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/README.md +38 -5
  5. data/Rakefile +18 -3
  6. data/ext/dependencies.rb +10 -4
  7. data/ext/dependencies_for_windows.rb +17 -0
  8. data/ext/extconf.rb +20 -8
  9. data/ext/options.rb +54 -14
  10. data/ext/options_for_windows.rb +51 -0
  11. data/ext/ruby_whisper.c +36 -42
  12. data/ext/ruby_whisper.h +135 -0
  13. data/ext/ruby_whisper_context.c +107 -28
  14. data/ext/ruby_whisper_log_queue.c +180 -0
  15. data/ext/ruby_whisper_log_settable.h +47 -0
  16. data/ext/ruby_whisper_parakeet.c +49 -0
  17. data/ext/ruby_whisper_parakeet_context.c +304 -0
  18. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  19. data/ext/ruby_whisper_parakeet_model.c +84 -0
  20. data/ext/ruby_whisper_parakeet_params.c +548 -0
  21. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  22. data/ext/ruby_whisper_parakeet_token.c +188 -0
  23. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  24. data/ext/ruby_whisper_params.c +256 -65
  25. data/ext/ruby_whisper_segment.c +6 -6
  26. data/ext/ruby_whisper_transcribe.cpp +42 -15
  27. data/ext/sources/CMakeLists.txt +41 -3
  28. data/ext/sources/CMakePresets.json +95 -0
  29. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  30. data/ext/sources/cmake/parakeet.pc.in +10 -0
  31. data/ext/sources/cmake/whisper.pc.in +1 -1
  32. data/ext/sources/examples/CMakeLists.txt +4 -2
  33. data/ext/sources/examples/bench/bench.cpp +1 -1
  34. data/ext/sources/examples/cli/cli.cpp +43 -9
  35. data/ext/sources/examples/common-ggml.cpp +2 -0
  36. data/ext/sources/examples/common-whisper.cpp +139 -67
  37. data/ext/sources/examples/common-whisper.h +11 -0
  38. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  39. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  40. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  41. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  42. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  43. data/ext/sources/examples/server/server.cpp +199 -163
  44. data/ext/sources/ggml/CMakeLists.txt +21 -13
  45. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  46. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  47. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  48. data/ext/sources/ggml/include/ggml-backend.h +72 -10
  49. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  50. data/ext/sources/ggml/include/ggml-rpc.h +3 -3
  51. data/ext/sources/ggml/include/ggml.h +101 -9
  52. data/ext/sources/ggml/include/gguf.h +10 -2
  53. data/ext/sources/ggml/src/CMakeLists.txt +22 -5
  54. data/ext/sources/ggml/src/ggml-alloc.c +5 -1
  55. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  56. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  57. data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
  58. data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
  59. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
  60. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
  61. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
  62. data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
  63. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
  64. data/ext/sources/ggml/src/ggml-common.h +11 -0
  65. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
  66. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
  67. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
  68. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
  69. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
  70. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  71. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  72. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
  73. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
  74. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
  75. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  76. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
  77. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
  78. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
  79. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  80. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
  81. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
  82. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  83. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
  84. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
  85. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
  86. data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
  87. data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
  88. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  89. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
  90. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
  91. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
  92. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  93. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  94. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  95. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  96. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  97. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  98. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  99. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  100. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  101. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  102. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  103. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  104. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  105. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  106. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  107. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
  108. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  109. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
  110. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  111. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  112. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
  113. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
  114. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  115. data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
  116. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  117. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  118. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  119. data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
  120. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  121. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
  122. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  123. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
  124. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
  125. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
  129. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
  130. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  131. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  132. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  133. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
  134. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  135. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
  136. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  137. data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
  138. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
  139. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
  140. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  141. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
  142. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
  143. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
  144. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
  145. data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
  146. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  147. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
  148. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  149. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
  150. data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
  151. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  152. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  153. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  154. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  155. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  156. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
  157. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  158. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  159. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  160. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  161. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
  162. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  163. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  164. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  165. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
  166. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  167. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  168. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  169. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  170. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  171. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  172. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  173. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  174. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  176. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  177. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  178. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  179. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  191. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
  192. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
  193. data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
  194. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  195. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
  196. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  197. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
  198. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  199. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
  200. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
  201. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
  202. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
  203. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
  204. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
  205. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  206. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  207. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
  208. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  209. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  210. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  211. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
  212. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  213. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
  214. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
  215. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
  216. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
  217. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
  218. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  219. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  220. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  221. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  222. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  223. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  224. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  225. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  226. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
  227. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
  228. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  229. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
  230. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
  231. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
  232. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
  233. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  235. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
  254. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
  255. data/ext/sources/ggml/src/ggml-impl.h +6 -1
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
  259. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
  260. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
  261. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
  262. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
  263. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
  264. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  265. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
  266. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
  322. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
  323. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
  324. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
  325. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
  326. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
  327. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  328. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
  329. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
  330. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  331. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
  332. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
  333. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
  334. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
  335. data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
  336. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  337. data/ext/sources/ggml/src/ggml-quants.c +289 -114
  338. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  339. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  340. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  341. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  342. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  343. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
  344. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
  345. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
  346. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  347. data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
  348. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
  349. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
  350. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  351. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  352. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  353. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  354. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  355. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  356. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
  357. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
  358. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  359. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  360. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
  361. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
  362. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
  363. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
  364. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
  365. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  366. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  367. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
  368. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
  369. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  370. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  371. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
  372. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  373. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  374. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  375. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  376. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  377. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
  378. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  379. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  380. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  381. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  382. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  383. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  384. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  385. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  386. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  387. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
  388. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
  389. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
  390. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
  391. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
  392. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
  393. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
  394. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
  395. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
  396. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
  397. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
  398. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
  399. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
  400. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
  401. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
  402. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
  403. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
  404. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
  405. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
  406. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
  407. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
  408. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
  409. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
  410. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
  411. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
  412. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
  413. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
  414. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
  415. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
  416. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
  417. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
  418. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
  420. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
  421. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
  422. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
  423. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  424. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  425. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  426. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
  427. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
  428. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
  429. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
  430. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
  431. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
  432. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
  433. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
  434. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  449. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
  450. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
  451. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
  452. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
  453. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  454. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  455. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  456. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
  457. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  458. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  459. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  460. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  461. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  462. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  463. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  464. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  465. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  466. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  467. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  468. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  469. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  470. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
  471. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
  472. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  473. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  474. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  475. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  476. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
  477. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  478. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
  479. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  480. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
  481. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  482. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  483. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
  484. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  485. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
  486. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
  487. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  488. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  489. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
  490. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
  491. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
  492. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  493. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
  494. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
  495. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  496. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  497. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  498. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  499. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  500. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  501. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
  502. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  503. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  504. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
  505. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  506. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  507. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  508. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
  509. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
  510. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
  511. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  512. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  513. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  514. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  515. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  516. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  517. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  518. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
  519. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  520. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
  521. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  522. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  523. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  524. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  525. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  526. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
  527. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  528. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
  529. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
  530. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
  531. data/ext/sources/ggml/src/ggml.c +110 -28
  532. data/ext/sources/ggml/src/gguf.cpp +173 -28
  533. data/ext/sources/include/parakeet.h +342 -0
  534. data/ext/sources/include/whisper.h +10 -0
  535. data/ext/sources/media/matmul.png +0 -0
  536. data/ext/sources/src/CMakeLists.txt +23 -0
  537. data/ext/sources/src/parakeet-arch.h +188 -0
  538. data/ext/sources/src/parakeet.cpp +3838 -0
  539. data/ext/sources/src/whisper.cpp +56 -12
  540. data/extsources.rb +26 -10
  541. data/lib/whisper/log_settable.rb +36 -0
  542. data/lib/whisper/model/uri.rb +13 -1
  543. data/lib/whisper/output.rb +74 -0
  544. data/sig/whisper.rbs +411 -62
  545. data/test/helper.rb +2 -0
  546. data/test/jfk_reader/jfk_reader.c +50 -7
  547. data/test/test_callback.rb +1 -0
  548. data/test/test_package.rb +6 -5
  549. data/test/test_parakeet.rb +28 -0
  550. data/test/test_parakeet_callback.rb +107 -0
  551. data/test/test_parakeet_context.rb +116 -0
  552. data/test/test_parakeet_context_params.rb +24 -0
  553. data/test/test_parakeet_model.rb +21 -0
  554. data/test/test_parakeet_params.rb +78 -0
  555. data/test/test_parakeet_segment.rb +42 -0
  556. data/test/test_parakeet_token.rb +73 -0
  557. data/test/test_params.rb +2 -0
  558. data/test/test_vad_segment.rb +1 -1
  559. data/test/test_whisper.rb +24 -6
  560. data/whispercpp.gemspec +2 -2
  561. metadata +215 -281
  562. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  563. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  564. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  565. data/ext/sources/bindings/javascript/package.json +0 -26
  566. data/ext/sources/bindings/javascript/whisper.js +0 -19
  567. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  568. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  569. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  570. data/ext/sources/examples/addon.node/index.js +0 -59
  571. data/ext/sources/examples/addon.node/package.json +0 -16
  572. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  573. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  574. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  575. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  576. data/ext/sources/examples/coi-serviceworker.js +0 -146
  577. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  578. data/ext/sources/examples/command/command.cpp +0 -802
  579. data/ext/sources/examples/command/commands.txt +0 -9
  580. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  581. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  582. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  583. data/ext/sources/examples/generate-karaoke.sh +0 -57
  584. data/ext/sources/examples/helpers.js +0 -191
  585. data/ext/sources/examples/livestream.sh +0 -112
  586. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  587. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  588. data/ext/sources/examples/lsp/whisper.vim +0 -362
  589. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  590. data/ext/sources/examples/python/whisper_processor.py +0 -54
  591. data/ext/sources/examples/server/bench.js +0 -29
  592. data/ext/sources/examples/server.py +0 -120
  593. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  594. data/ext/sources/examples/stream/stream.cpp +0 -437
  595. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  596. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  597. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  598. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  599. data/ext/sources/examples/sycl/build.sh +0 -22
  600. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  601. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  602. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
  603. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  604. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
  605. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
  606. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
  607. data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
  608. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
  609. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  610. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
  611. data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
  612. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
  613. data/ext/sources/examples/talk-llama/llama-context.h +0 -359
  614. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  615. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
  616. data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
  617. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  618. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  619. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
  620. data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
  621. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
  622. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
  623. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  624. data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
  625. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  626. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  627. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
  628. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  629. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
  630. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
  631. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  632. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
  633. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
  634. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  635. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  636. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
  637. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  638. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  639. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  640. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
  641. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  642. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
  643. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
  644. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
  645. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
  646. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
  647. data/ext/sources/examples/talk-llama/llama-model.h +0 -597
  648. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
  649. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  650. data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
  651. data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
  652. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
  653. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
  654. data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
  655. data/ext/sources/examples/talk-llama/llama.h +0 -1573
  656. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
  657. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  658. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  659. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
  660. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  661. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
  662. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
  663. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
  664. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
  665. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
  666. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  667. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  668. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  669. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  670. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  671. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  672. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  673. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
  674. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  675. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
  676. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
  677. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
  678. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
  679. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  680. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
  681. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  682. data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
  683. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
  684. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  685. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  686. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
  687. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  688. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  689. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  690. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  691. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  692. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  693. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  694. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
  695. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  696. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  697. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
  698. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
  699. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  700. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
  701. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  702. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
  703. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  704. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  705. data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
  706. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  707. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
  708. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
  709. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  710. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  711. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  712. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
  713. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  714. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
  715. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
  716. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
  717. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
  718. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
  719. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  720. data/ext/sources/examples/talk-llama/models/models.h +0 -704
  721. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
  722. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  723. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
  724. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  725. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  726. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  727. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  728. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  729. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  730. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  731. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  732. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
  733. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  734. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  735. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  736. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  737. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
  738. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  739. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
  740. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  741. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  742. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  743. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  744. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
  745. data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
  746. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
  747. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
  748. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
  749. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
  750. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
  751. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  752. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  753. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
  754. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  755. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  756. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
  757. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  758. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  759. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  760. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  761. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  762. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  763. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  764. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
  765. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  766. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  767. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  768. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  769. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  770. data/ext/sources/examples/talk-llama/speak +0 -40
  771. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  772. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  773. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  774. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  775. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  776. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
  777. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  778. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  779. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  780. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  781. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  782. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  783. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  784. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  785. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  786. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  787. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  788. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  789. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  790. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  791. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
  792. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  793. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  794. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
  795. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
  796. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  798. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
  799. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  800. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
  801. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  802. data/ext/sources/tests/CMakeLists.txt +0 -112
  803. data/ext/sources/tests/earnings21/eval.mk +0 -58
  804. data/ext/sources/tests/earnings21/eval.py +0 -68
  805. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  806. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  807. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  808. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  809. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  810. data/ext/sources/tests/en-0-ref.txt +0 -1
  811. data/ext/sources/tests/en-1-ref.txt +0 -1
  812. data/ext/sources/tests/en-2-ref.txt +0 -1
  813. data/ext/sources/tests/es-0-ref.txt +0 -1
  814. data/ext/sources/tests/librispeech/eval.mk +0 -39
  815. data/ext/sources/tests/librispeech/eval.py +0 -47
  816. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  817. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  818. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  819. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  820. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  821. data/ext/sources/tests/run-tests.sh +0 -130
  822. data/ext/sources/tests/test-c.c +0 -3
  823. data/ext/sources/tests/test-vad-full.cpp +0 -56
  824. data/ext/sources/tests/test-vad.cpp +0 -83
  825. data/ext/sources/tests/test-whisper.js +0 -58
  826. data/lib/whisper/context.rb +0 -15
  827. data/lib/whisper/segment.rb +0 -58
  828. /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
@@ -1,1165 +0,0 @@
1
- #include "llama-memory-recurrent.h"
2
-
3
- #include "llama-impl.h"
4
- #include "llama-io.h"
5
- #include "llama-batch.h"
6
- #include "llama-model.h"
7
-
8
- #include <algorithm>
9
- #include <cassert>
10
- #include <cstring>
11
- #include <limits>
12
- #include <map>
13
- #include <stdexcept>
14
-
15
- //
16
- // llama_memory_recurrent
17
- //
18
-
19
- llama_memory_recurrent::llama_memory_recurrent(
20
- const llama_model & model,
21
- ggml_type type_r,
22
- ggml_type type_s,
23
- bool offload,
24
- uint32_t mem_size,
25
- uint32_t n_seq_max,
26
- const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
27
- const int32_t n_layer = hparams.n_layer;
28
-
29
- head = 0;
30
- size = mem_size;
31
- used = 0;
32
-
33
- cells.clear();
34
- cells.resize(mem_size);
35
-
36
- // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
37
- struct ggml_backend_buft_comparator {
38
- bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
39
- return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
40
- }
41
- };
42
- std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
43
-
44
- // create a context for each buffer type
45
- auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
46
- auto it = ctx_map.find(buft);
47
- if (it == ctx_map.end()) {
48
- ggml_init_params params = {
49
- /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
50
- /*.mem_buffer =*/ NULL,
51
- /*.no_alloc =*/ true,
52
- };
53
-
54
- ggml_context * ctx = ggml_init(params);
55
- if (!ctx) {
56
- return nullptr;
57
- }
58
-
59
- ctx_map.emplace(buft, ctx);
60
-
61
- return ctx;
62
- }
63
-
64
- return it->second.get();
65
- };
66
-
67
- r_l.resize(n_layer);
68
- s_l.resize(n_layer);
69
-
70
- for (int i = 0; i < n_layer; i++) {
71
- if (filter && !filter(i)) {
72
- LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, i);
73
- continue;
74
- }
75
-
76
- const char * dev_name = "CPU";
77
-
78
- ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
79
-
80
- if (offload) {
81
- auto * dev = model.dev_layer(i);
82
- buft = ggml_backend_dev_buffer_type(dev);
83
-
84
- dev_name = ggml_backend_dev_name(dev);
85
- }
86
-
87
- LLAMA_LOG_DEBUG("%s, layer %3d: dev = %s\n", __func__, i, dev_name);
88
-
89
- ggml_context * ctx = ctx_for_buft(buft);
90
- if (!ctx) {
91
- throw std::runtime_error("failed to create ggml context for rs cache");
92
- }
93
-
94
- ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size);
95
- ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, hparams.n_embd_s()*mem_size);
96
- ggml_format_name(r, "cache_r_l%d", i);
97
- ggml_format_name(s, "cache_s_l%d", i);
98
- r_l[i] = r;
99
- s_l[i] = s;
100
- }
101
-
102
- // allocate tensors and initialize the buffers to avoid NaNs in the padding
103
- for (auto & [buft, ctx] : ctx_map) {
104
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
105
- if (!buf) {
106
- throw std::runtime_error("failed to allocate buffer for rs cache");
107
- }
108
- ggml_backend_buffer_clear(buf, 0);
109
- LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
110
- ctxs_bufs.emplace_back(std::move(ctx), buf);
111
- }
112
-
113
- {
114
- const size_t memory_size_r = size_r_bytes();
115
- const size_t memory_size_s = size_s_bytes();
116
-
117
- LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
118
- (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max,
119
- ggml_type_name(type_r), (float)memory_size_r / (1024.0f * 1024.0f),
120
- ggml_type_name(type_s), (float)memory_size_s / (1024.0f * 1024.0f));
121
- }
122
- }
123
-
124
- void llama_memory_recurrent::clear(bool data) {
125
- for (int32_t i = 0; i < (int32_t) size; ++i) {
126
- cells[i].pos = -1;
127
- cells[i].seq_id.clear();
128
- cells[i].src = -1;
129
- cells[i].tail = -1;
130
- }
131
-
132
- head = 0;
133
- used = 0;
134
-
135
- if (data) {
136
- for (auto & [_, buf] : ctxs_bufs) {
137
- ggml_backend_buffer_clear(buf.get(), 0);
138
- }
139
- }
140
- }
141
-
142
- bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
143
- //printf("[DEBUG] calling llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1);
144
- uint32_t new_head = size;
145
-
146
- if (p0 < 0) {
147
- p0 = 0;
148
- }
149
-
150
- if (p1 < 0) {
151
- p1 = std::numeric_limits<llama_pos>::max();
152
- }
153
-
154
- // models like Mamba or RWKV can't have a state partially erased at the end
155
- // of the sequence because their state isn't preserved for previous tokens
156
- if (seq_id >= (int64_t) size) {
157
- // could be fatal
158
- return false;
159
- }
160
- if (0 <= seq_id) {
161
- int32_t & tail_id = cells[seq_id].tail;
162
- if (tail_id >= 0) {
163
- const auto & cell = cells[tail_id];
164
- // partial intersection is invalid if it includes the final pos
165
- if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
166
- //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false, p0 = %d, cell.pos = %d, p1 = %d\n", p0, cell.pos, p1);
167
- return false;
168
- }
169
- // invalidate tails which will be cleared
170
- if (p0 <= cell.pos && cell.pos < p1) {
171
- tail_id = -1;
172
- }
173
- }
174
- } else {
175
- // seq_id is negative, then the range should include everything or nothing
176
- if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
177
- //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: `seq_id` is negative, so returning false\n");
178
- return false;
179
- }
180
- }
181
-
182
- for (uint32_t i = 0; i < size; ++i) {
183
- if (cells[i].pos >= p0 && cells[i].pos < p1) {
184
- if (seq_id < 0) {
185
- cells[i].seq_id.clear();
186
- } else if (cells[i].has_seq_id(seq_id)) {
187
- cells[i].seq_id.erase(seq_id);
188
- } else {
189
- continue;
190
- }
191
- if (cells[i].is_empty()) {
192
- // keep count of the number of used cells
193
- if (cells[i].pos >= 0) {
194
- used--;
195
- }
196
- cells[i].pos = -1;
197
- cells[i].src = -1;
198
- if (new_head == size) {
199
- new_head = i;
200
- }
201
- }
202
- }
203
- }
204
-
205
- // If we freed up a slot, set head to it so searching can start there.
206
- if (new_head != size && new_head < head) {
207
- head = new_head;
208
- }
209
-
210
- return true;
211
- }
212
-
213
- void llama_memory_recurrent::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
214
- if (seq_id_src == seq_id_dst) {
215
- return;
216
- }
217
-
218
- if (p0 < 0) {
219
- p0 = 0;
220
- }
221
-
222
- if (p1 < 0) {
223
- p1 = std::numeric_limits<llama_pos>::max();
224
- }
225
-
226
- if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
227
- auto & tail_src = cells[seq_id_src];
228
- auto & tail_dst = cells[seq_id_dst];
229
- if (tail_dst.tail >= 0) {
230
- // clear destination seq_id if it wasn't empty
231
- auto & cell_dst = cells[tail_dst.tail];
232
-
233
- cell_dst.seq_id.erase(seq_id_dst);
234
- tail_dst.tail = -1;
235
- if (cell_dst.seq_id.empty()) {
236
- cell_dst.pos = -1;
237
- cell_dst.src = -1;
238
- used -= 1;
239
- }
240
- }
241
- if (tail_src.tail >= 0) {
242
- auto & cell_src = cells[tail_src.tail];
243
-
244
- cell_src.seq_id.insert(seq_id_dst);
245
- tail_dst.tail = tail_src.tail;
246
- }
247
- }
248
- }
249
-
250
- void llama_memory_recurrent::seq_keep(llama_seq_id seq_id) {
251
- uint32_t new_head = size;
252
-
253
- for (uint32_t i = 0; i < size; ++i) {
254
- if ((llama_seq_id) i != seq_id) {
255
- cells[i].tail = -1;
256
- }
257
-
258
- if (!cells[i].has_seq_id(seq_id)) {
259
- if (cells[i].pos >= 0) {
260
- used--;
261
- }
262
-
263
- cells[i].pos = -1;
264
- cells[i].src = -1;
265
- cells[i].seq_id.clear();
266
-
267
- if (new_head == size){
268
- new_head = i;
269
- }
270
- } else {
271
- cells[i].seq_id.clear();
272
- cells[i].seq_id.insert(seq_id);
273
- }
274
- }
275
-
276
- // If we freed up a slot, set head to it so searching can start there.
277
- if (new_head != size && new_head < head) {
278
- head = new_head;
279
- }
280
- }
281
-
282
- void llama_memory_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
283
- if (shift == 0) {
284
- return;
285
- }
286
-
287
- if (p0 < 0) {
288
- p0 = 0;
289
- }
290
-
291
- if (p1 < 0) {
292
- p1 = std::numeric_limits<llama_pos>::max();
293
- }
294
-
295
- // If there is no range then return early to avoid looping over the
296
- if (p0 == p1) {
297
- return;
298
- }
299
-
300
- // for Mamba-like or RWKV models, only the pos needs to be shifted
301
- if (0 <= seq_id && seq_id < (int64_t) size) {
302
- const int32_t tail_id = cells[seq_id].tail;
303
- if (tail_id >= 0) {
304
- auto & cell = cells[tail_id];
305
- if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
306
- cell.pos += shift;
307
- }
308
- }
309
- }
310
- }
311
-
312
- void llama_memory_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
313
- if (d == 1) {
314
- return;
315
- }
316
-
317
- if (p0 < 0) {
318
- p0 = 0;
319
- }
320
-
321
- if (p1 < 0) {
322
- p1 = std::numeric_limits<llama_pos>::max();
323
- }
324
-
325
- // If there is no range then return early to avoid looping over the cache.
326
- if (p0 == p1) {
327
- return;
328
- }
329
-
330
- // for Mamba-like or RWKV models, only the pos needs to be changed
331
- if (0 <= seq_id && seq_id < (int64_t) size) {
332
- const int32_t tail_id = cells[seq_id].tail;
333
- if (tail_id >= 0) {
334
- auto & cell = cells[tail_id];
335
- if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
336
- cell.pos /= d;
337
- }
338
- }
339
- }
340
- }
341
-
342
- llama_pos llama_memory_recurrent::seq_pos_min(llama_seq_id seq_id) const {
343
- llama_pos result = std::numeric_limits<llama_pos>::max();
344
-
345
- for (uint32_t i = 0; i < size; ++i) {
346
- if (cells[i].has_seq_id(seq_id)) {
347
- result = std::min(result, cells[i].pos);
348
- }
349
- }
350
-
351
- if (result == std::numeric_limits<llama_pos>::max()) {
352
- result = -1;
353
- }
354
-
355
- return result;
356
- }
357
-
358
- llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
359
- llama_pos result = -1;
360
-
361
- for (uint32_t i = 0; i < size; ++i) {
362
- if (cells[i].has_seq_id(seq_id)) {
363
- result = std::max(result, cells[i].pos);
364
- }
365
- }
366
-
367
- return result;
368
- }
369
-
370
- std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
371
- std::map<ggml_backend_buffer_type_t, size_t> ret;
372
- for (const auto & [_, buf] : ctxs_bufs) {
373
- ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
374
- }
375
- return ret;
376
- }
377
-
378
- llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
379
- do {
380
- balloc.split_reset();
381
-
382
- std::vector<llama_ubatch> ubatches;
383
- while (true) {
384
- llama_ubatch ubatch;
385
-
386
- if (embd_all) {
387
- // if all tokens are output, split by sequence
388
- ubatch = balloc.split_seq(n_ubatch);
389
- } else {
390
- // TODO: non-sequential equal split can be done if using unified KV cache
391
- // for simplicity, we always use sequential equal split for now
392
- ubatch = balloc.split_equal(n_ubatch, true);
393
- }
394
-
395
- if (ubatch.n_tokens == 0) {
396
- break;
397
- }
398
-
399
- ubatches.push_back(std::move(ubatch)); // NOLINT
400
- }
401
-
402
- if (balloc.get_n_used() < balloc.get_n_tokens()) {
403
- // failed to find a suitable split
404
- break;
405
- }
406
-
407
- if (!prepare(ubatches)) {
408
- break;
409
- }
410
-
411
- return std::make_unique<llama_memory_recurrent_context>(this, std::move(ubatches));
412
- } while (false);
413
-
414
- return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
415
- }
416
-
417
- llama_memory_context_ptr llama_memory_recurrent::init_full() {
418
- return std::make_unique<llama_memory_recurrent_context>(this);
419
- }
420
-
421
- llama_memory_context_ptr llama_memory_recurrent::init_update(llama_context * lctx, bool optimize) {
422
- GGML_UNUSED(lctx);
423
- GGML_UNUSED(optimize);
424
-
425
- return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_NO_UPDATE);
426
- }
427
-
428
- bool llama_memory_recurrent::prepare(const std::vector<llama_ubatch> & ubatches) {
429
- // simply remember the full state because it is very small for this type of cache
430
- // TODO: optimize
431
- auto org_cells = cells;
432
- auto org_used = used;
433
- auto org_head = head;
434
-
435
- bool success = true;
436
-
437
- for (const auto & ubatch : ubatches) {
438
- if (!find_slot(ubatch)) {
439
- success = false;
440
- break;
441
- }
442
- }
443
-
444
- // restore the original state
445
- cells = std::move(org_cells);
446
- used = org_used;
447
- head = org_head;
448
-
449
- return success;
450
- }
451
-
452
- bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
453
- const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
454
- const uint32_t n_seqs = ubatch.n_seqs;
455
-
456
- // if we have enough unused cells before the current head ->
457
- // better to start searching from the beginning of the cache, hoping to fill it
458
- if (head > used + 2*n_seqs) {
459
- head = 0;
460
- }
461
-
462
- // For recurrent state architectures (like Mamba or RWKV),
463
- // each cache cell can store the state for a whole sequence.
464
- // A slot should be always be contiguous.
465
-
466
- // can only process batches with an equal number of new tokens in each sequence
467
- GGML_ASSERT(ubatch.equal_seqs());
468
-
469
- int32_t min = size - 1;
470
- int32_t max = 0;
471
-
472
- // everything should fit if all seq_ids are smaller than the max
473
- for (uint32_t s = 0; s < n_seqs; ++s) {
474
- const uint32_t i = s*n_seq_tokens; // first token of sequence set s
475
- const uint32_t n_seq_id = ubatch.n_seq_id[i];
476
-
477
- for (uint32_t j = 0; j < n_seq_id; ++j) {
478
- const llama_seq_id seq_id = ubatch.seq_id[i][j];
479
-
480
- if (seq_id < 0 || (uint32_t) seq_id >= size) {
481
- // too big seq_id
482
- // TODO: would it be possible to resize the cache instead?
483
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max);
484
- return false;
485
- }
486
- if (j > 0) {
487
- auto & seq = cells[seq_id];
488
- if (seq.tail >= 0) {
489
- auto & cell = cells[seq.tail];
490
- // clear cells from seq_ids that become shared
491
- // (should not normally happen, but let's handle it anyway)
492
- cell.seq_id.erase(seq_id);
493
- seq.tail = -1;
494
- if (cell.seq_id.empty()) {
495
- cell.pos = -1;
496
- cell.src = -1;
497
- used -= 1;
498
- }
499
- }
500
- }
501
- }
502
- }
503
-
504
- #ifndef NDEBUG
505
- {
506
- std::vector<int32_t> tails_verif;
507
- tails_verif.assign(size, -1);
508
- for (uint32_t i = 0; i < size; ++i) {
509
- auto & cell = cells[i];
510
- for (llama_seq_id seq_id : cell.seq_id) {
511
- if (tails_verif[seq_id] != -1) {
512
- LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
513
- }
514
- tails_verif[seq_id] = i;
515
- }
516
- }
517
- for (uint32_t i = 0; i < size; ++i) {
518
- if (tails_verif[i] != cells[i].tail) {
519
- LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]);
520
- }
521
- }
522
- }
523
- #endif
524
-
525
- // find next empty cell
526
- uint32_t next_empty_cell = head;
527
-
528
- for (uint32_t i = 0; i < size; ++i) {
529
- if (next_empty_cell >= size) { next_empty_cell -= size; }
530
- auto & cell = cells[next_empty_cell];
531
- if (cell.is_empty()) { break; }
532
- next_empty_cell += 1;
533
- }
534
-
535
- // find usable cell range
536
- for (uint32_t s = 0; s < n_seqs; ++s) {
537
- const uint32_t i = s*n_seq_tokens;
538
- const llama_seq_id seq_id = ubatch.seq_id[i][0];
539
- auto & seq_meta = cells[seq_id];
540
- bool has_cell = false;
541
- if (seq_meta.tail >= 0) {
542
- auto & cell = cells[seq_meta.tail];
543
- GGML_ASSERT(cell.has_seq_id(seq_id));
544
- // does this seq_id "own" the cell?
545
- if (cell.seq_id.size() == 1) { has_cell = true; }
546
- }
547
- if (!has_cell) {
548
- auto & empty_cell = cells[next_empty_cell];
549
- GGML_ASSERT(empty_cell.is_empty());
550
- // copy old tail into the empty cell
551
- if (seq_meta.tail >= 0) {
552
- auto & orig_cell = cells[seq_meta.tail];
553
- empty_cell.pos = orig_cell.pos;
554
- empty_cell.src = orig_cell.src;
555
- orig_cell.seq_id.erase(seq_id);
556
- empty_cell.seq_id.insert(seq_id); // will be overwritten
557
- GGML_ASSERT(!orig_cell.is_empty()); // has at least one remaining seq_id
558
- }
559
- seq_meta.tail = next_empty_cell;
560
- // find next empty cell
561
- if (s + 1 < n_seqs) {
562
- for (uint32_t j = 0; j < size; ++j) {
563
- next_empty_cell += 1;
564
- if (next_empty_cell >= size) { next_empty_cell -= size; }
565
- auto & cell = cells[next_empty_cell];
566
- if (cell.is_empty()) { break; }
567
- }
568
- }
569
- }
570
- if (min > seq_meta.tail) { min = seq_meta.tail; }
571
- if (max < seq_meta.tail) { max = seq_meta.tail; }
572
- }
573
-
574
- // gather and re-order
575
- for (uint32_t s = 0; s < n_seqs; ++s) {
576
- const uint32_t i = s*n_seq_tokens;
577
- const int32_t dst_id = s + min;
578
- const int32_t src_id = cells[ubatch.seq_id[i][0]].tail;
579
- if (dst_id != src_id) {
580
- auto & dst_cell = cells[dst_id];
581
- auto & src_cell = cells[src_id];
582
-
583
- std::swap(dst_cell.pos, src_cell.pos);
584
- std::swap(dst_cell.src, src_cell.src);
585
- std::swap(dst_cell.seq_id, src_cell.seq_id);
586
-
587
- // swap tails
588
- for (uint32_t j = 0; j < size; ++j) {
589
- int32_t & tail = cells[j].tail;
590
- if (tail == src_id) {
591
- tail = dst_id;
592
- } else if (tail == dst_id) {
593
- tail = src_id;
594
- }
595
- }
596
- }
597
- }
598
-
599
- // update the pos of the used seqs
600
- for (uint32_t s = 0; s < n_seqs; ++s) {
601
- const uint32_t i = s*n_seq_tokens;
602
- const llama_pos last_pos = ubatch.pos[i + n_seq_tokens - 1];
603
- const int32_t cell_id = s + min;
604
- auto & cell = cells[cell_id];
605
-
606
- if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
607
- // What should happen when the pos backtracks or skips a value?
608
- // Clearing the state mid-batch would require special-casing which isn't done.
609
- LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
610
- __func__, last_pos, cell.pos, ubatch.seq_id[i][0], n_seq_tokens);
611
- }
612
- cell.pos = last_pos;
613
- cell.seq_id.clear();
614
- for (int32_t j = 0; j < ubatch.n_seq_id[i]; ++j) {
615
- const llama_seq_id seq_id = ubatch.seq_id[i][j];
616
- cell.seq_id.insert(seq_id);
617
- cells[seq_id].tail = cell_id;
618
- }
619
- }
620
-
621
- // Find first cell without src refs, to use as the zero-ed state
622
- {
623
- // TODO: bake-in src refcounts in the cell metadata
624
- std::vector<int32_t> refcounts(size, 0);
625
- for (size_t i = 0; i < size; ++i) {
626
- const int32_t src = cells[i].src;
627
- if (src >= 0) {
628
- refcounts[src] += 1;
629
- }
630
- }
631
-
632
- rs_z = -1;
633
- for (int i = min; i <= max; ++i) {
634
- if (refcounts[i] == 0) {
635
- rs_z = i;
636
- break;
637
- }
638
- }
639
-
640
- for (int i = min; i <= max; ++i) {
641
- if (cells[i].src < 0) {
642
- GGML_ASSERT(rs_z >= 0);
643
- cells[i].src0 = rs_z;
644
- } else {
645
- // Stage the source ids for all used cells to allow correct seq_* behavior
646
- // and still make these values available when setting the inputs
647
- cells[i].src0 = cells[i].src;
648
- }
649
- cells[i].src = i; // avoid moving or clearing twice
650
- }
651
- }
652
-
653
- // allow getting the range of used cells, from head to head + n
654
- head = min;
655
- n = max - min + 1;
656
- used = std::count_if(cells.begin(), cells.end(),
657
- [](const mem_cell & cell){ return !cell.is_empty(); });
658
-
659
- // sanity check
660
- return n >= n_seqs;
661
- }
662
-
663
- bool llama_memory_recurrent::get_can_shift() const {
664
- // shifting the pos is trivial for recurrent models
665
- return true;
666
- }
667
-
668
- size_t llama_memory_recurrent::total_size() const {
669
- size_t size = 0;
670
- for (const auto & [_, buf] : ctxs_bufs) {
671
- size += ggml_backend_buffer_get_size(buf.get());
672
- }
673
-
674
- return size;
675
- }
676
-
677
- size_t llama_memory_recurrent::size_r_bytes() const {
678
- size_t size_r_bytes = 0;
679
-
680
- for (const auto & r : r_l) {
681
- if (r != nullptr) {
682
- size_r_bytes += ggml_nbytes(r);
683
- }
684
- }
685
-
686
- return size_r_bytes;
687
- }
688
-
689
- size_t llama_memory_recurrent::size_s_bytes() const {
690
- size_t size_s_bytes = 0;
691
-
692
- for (const auto & s : s_l) {
693
- if (s != nullptr) {
694
- size_s_bytes += ggml_nbytes(s);
695
- }
696
- }
697
-
698
- return size_s_bytes;
699
- }
700
-
701
- void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
702
- GGML_UNUSED(flags);
703
-
704
- std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
705
- uint32_t cell_count = 0;
706
-
707
- // Count the number of cells with the specified seq_id
708
- // Find all the ranges of cells with this seq id (or all, when -1)
709
- uint32_t cell_range_begin = size;
710
- for (uint32_t i = 0; i < size; ++i) {
711
- const auto & cell = cells[i];
712
- if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
713
- ++cell_count;
714
- if (cell_range_begin == size) {
715
- cell_range_begin = i;
716
- }
717
- } else {
718
- if (cell_range_begin != size) {
719
- cell_ranges.emplace_back(cell_range_begin, i);
720
- cell_range_begin = size;
721
- }
722
- }
723
- }
724
- if (cell_range_begin != size) {
725
- cell_ranges.emplace_back(cell_range_begin, size);
726
- }
727
-
728
- // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
729
- uint32_t cell_count_check = 0;
730
- for (const auto & range : cell_ranges) {
731
- cell_count_check += range.second - range.first;
732
- }
733
- GGML_ASSERT(cell_count == cell_count_check);
734
-
735
- io.write(&cell_count, sizeof(cell_count));
736
-
737
- state_write_meta(io, cell_ranges, seq_id);
738
- state_write_data(io, cell_ranges);
739
- }
740
-
741
- void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
742
- GGML_UNUSED(flags);
743
-
744
- uint32_t cell_count;
745
- io.read_to(&cell_count, sizeof(cell_count));
746
-
747
- bool res = true;
748
-
749
- res = res && state_read_meta(io, cell_count, seq_id);
750
- res = res && state_read_data(io, cell_count);
751
-
752
- if (!res) {
753
- if (seq_id == -1) {
754
- clear(true);
755
- } else {
756
- seq_rm(seq_id, -1, -1);
757
- }
758
- throw std::runtime_error("failed to restore kv cache");
759
- }
760
- }
761
-
762
- void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
763
- for (const auto & range : cell_ranges) {
764
- for (uint32_t i = range.first; i < range.second; ++i) {
765
- const auto & cell = cells[i];
766
- const llama_pos pos = cell.pos;
767
- const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
768
-
769
- io.write(&pos, sizeof(pos));
770
- io.write(&n_seq_id, sizeof(n_seq_id));
771
-
772
- if (n_seq_id) {
773
- for (auto seq_id : cell.seq_id) {
774
- io.write(&seq_id, sizeof(seq_id));
775
- }
776
- }
777
- }
778
- }
779
- }
780
-
781
- void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
782
- const uint32_t s_trans = 0;
783
- const uint32_t n_layer = hparams.n_layer;
784
-
785
- io.write(&s_trans, sizeof(s_trans));
786
- io.write(&n_layer, sizeof(n_layer));
787
-
788
- // Iterate and write all the R tensors first, each row is a cell
789
- // Get whole range at a time
790
- for (uint32_t il = 0; il < n_layer; ++il) {
791
- // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
792
- if (r_l[il] == nullptr) continue;
793
-
794
- // Write R tensor type
795
- const int32_t r_type_i = (int32_t)r_l[il]->type;
796
- io.write(&r_type_i, sizeof(r_type_i));
797
-
798
- // Write row size of R tensor
799
- const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
800
- io.write(&r_size_row, sizeof(r_size_row));
801
-
802
- // Write each range of cells of r_size_row length
803
- for (const auto & range : cell_ranges) {
804
- const size_t range_size = range.second - range.first;
805
- const size_t buf_size = range_size * r_size_row;
806
- io.write_tensor(r_l[il], range.first * r_size_row, buf_size);
807
- }
808
- }
809
-
810
- if (!s_trans) {
811
- for (uint32_t il = 0; il < n_layer; ++il) {
812
- // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
813
- if (s_l[il] == nullptr) continue;
814
-
815
- // Write S tensor type
816
- const int32_t s_type_i = (int32_t)s_l[il]->type;
817
- io.write(&s_type_i, sizeof(s_type_i));
818
-
819
- // Write row size of S tensor
820
- const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
821
- io.write(&s_size_row, sizeof(s_size_row));
822
-
823
- // Write each range of S tensor rows
824
- for (const auto & range : cell_ranges) {
825
- const size_t range_size = range.second - range.first;
826
- const size_t buf_size = range_size * s_size_row;
827
- io.write_tensor(s_l[il], range.first * s_size_row, buf_size);
828
- }
829
- }
830
- } else {
831
- // When S tensor is transposed, we also need the element size and get the element ranges from each row
832
- const uint32_t mem_size = size;
833
- for (uint32_t il = 0; il < n_layer; ++il) {
834
- // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
835
- if (s_l[il] == nullptr) continue;
836
-
837
- const uint32_t n_embd_s = hparams.n_embd_s();
838
-
839
- // Write S tensor type
840
- const int32_t s_type_i = (int32_t)s_l[il]->type;
841
- io.write(&s_type_i, sizeof(s_type_i));
842
-
843
- // Write element size
844
- const uint32_t s_size_el = ggml_type_size(s_l[il]->type);
845
- io.write(&s_size_el, sizeof(s_size_el));
846
-
847
- // Write GQA embedding size
848
- io.write(&n_embd_s, sizeof(n_embd_s));
849
-
850
- // For each row, we get the element values of each cell
851
- for (uint32_t j = 0; j < n_embd_s; ++j) {
852
- // Write each range of cells of s_size_el length
853
- for (const auto & range : cell_ranges) {
854
- const size_t range_size = range.second - range.first;
855
- const size_t src_offset = (range.first + j * mem_size) * s_size_el;
856
- const size_t buf_size = range_size * s_size_el;
857
- io.write_tensor(s_l[il], src_offset, buf_size);
858
- }
859
- }
860
- }
861
- }
862
- }
863
-
864
- bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
865
- if (dest_seq_id != -1) {
866
- // single sequence
867
- seq_rm(dest_seq_id, -1, -1);
868
-
869
- if (cell_count == 0) {
870
- return true;
871
- }
872
-
873
- llama_batch_allocr balloc(hparams.n_pos_per_embd());
874
-
875
- llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
876
-
877
- for (uint32_t i = 0; i < cell_count; ++i) {
878
- llama_pos pos;
879
- uint32_t n_seq_id;
880
-
881
- io.read_to(&pos, sizeof(pos));
882
- io.read_to(&n_seq_id, sizeof(n_seq_id));
883
-
884
- if (n_seq_id != 0) {
885
- LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
886
- return false;
887
- }
888
-
889
- ubatch.pos[i] = pos;
890
- }
891
- ubatch.n_seq_id[0] = 1;
892
- ubatch.seq_id[0] = &dest_seq_id;
893
-
894
- if (!find_slot(ubatch)) {
895
- LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
896
- return false;
897
- }
898
-
899
- // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
900
- // Assume that this is one contiguous block of cells
901
- GGML_ASSERT(head + cell_count <= size);
902
- GGML_ASSERT(cells[head].pos == ubatch.pos[0]);
903
- GGML_ASSERT(cells[head + cell_count - 1].pos == ubatch.pos[cell_count - 1]);
904
- GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
905
- GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
906
- } else {
907
- // whole KV cache restore
908
-
909
- if (cell_count > size) {
910
- LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
911
- return false;
912
- }
913
-
914
- clear(true);
915
-
916
- for (uint32_t i = 0; i < cell_count; ++i) {
917
- auto & cell = cells[i];
918
-
919
- llama_pos pos;
920
- uint32_t n_seq_id;
921
-
922
- io.read_to(&pos, sizeof(pos));
923
- io.read_to(&n_seq_id, sizeof(n_seq_id));
924
-
925
- cell.pos = pos;
926
-
927
- for (uint32_t j = 0; j < n_seq_id; ++j) {
928
- llama_seq_id seq_id;
929
- io.read_to(&seq_id, sizeof(seq_id));
930
-
931
- // TODO: llama_memory_recurrent should have a notion of max sequences
932
- //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
933
- if (seq_id < 0) {
934
- //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
935
- LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
936
- return false;
937
- }
938
-
939
- cell.seq_id.insert(seq_id);
940
-
941
- int32_t & tail = cells[seq_id].tail;
942
- if (tail != -1) {
943
- LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
944
- return false;
945
- }
946
- tail = i;
947
- }
948
- }
949
-
950
- head = 0;
951
- used = cell_count;
952
- }
953
-
954
- for (uint32_t i = 0; i < cell_count; ++i) {
955
- uint32_t cell_id = head + i;
956
- // make sure the recurrent states will keep their restored state
957
- cells[cell_id].src = cell_id;
958
- }
959
-
960
- return true;
961
- }
962
-
963
- bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
964
- uint32_t s_trans;
965
- uint32_t n_layer;
966
- io.read_to(&s_trans, sizeof(s_trans));
967
- io.read_to(&n_layer, sizeof(n_layer));
968
-
969
- if (n_layer != hparams.n_layer) {
970
- LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
971
- return false;
972
- }
973
- if (cell_count > size) {
974
- LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
975
- return false;
976
- }
977
- if (false != (bool) s_trans) {
978
- LLAMA_LOG_ERROR("%s: incompatible s transposition\n", __func__);
979
- return false;
980
- }
981
-
982
- // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
983
- for (uint32_t il = 0; il < n_layer; ++il) {
984
- // skip null layers
985
- if (r_l[il] == nullptr) continue;
986
-
987
- // Read type of key
988
- int32_t r_type_i_ref;
989
- io.read_to(&r_type_i_ref, sizeof(r_type_i_ref));
990
- const int32_t r_type_i = (int32_t) r_l[il]->type;
991
- if (r_type_i != r_type_i_ref) {
992
- LLAMA_LOG_ERROR("%s: mismatched r type (%d != %d, layer %d)\n", __func__, r_type_i, r_type_i_ref, il);
993
- return false;
994
- }
995
-
996
- // Read row size of key
997
- uint64_t r_size_row_ref;
998
- io.read_to(&r_size_row_ref, sizeof(r_size_row_ref));
999
- const size_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
1000
- if (r_size_row != r_size_row_ref) {
1001
- LLAMA_LOG_ERROR("%s: mismatched r row size (%zu != %zu, layer %d)\n", __func__, r_size_row, (size_t) r_size_row_ref, il);
1002
- return false;
1003
- }
1004
-
1005
- if (cell_count) {
1006
- // Read and set the keys for the whole cell range
1007
- ggml_backend_tensor_set(r_l[il], io.read(cell_count * r_size_row), head * r_size_row, cell_count * r_size_row);
1008
- }
1009
- }
1010
-
1011
- if (!s_trans) {
1012
- for (uint32_t il = 0; il < n_layer; ++il) {
1013
- // skip null layers
1014
- if (s_l[il] == nullptr) continue;
1015
-
1016
- // Read type of value
1017
- int32_t s_type_i_ref;
1018
- io.read_to(&s_type_i_ref, sizeof(s_type_i_ref));
1019
- const int32_t s_type_i = (int32_t)s_l[il]->type;
1020
-
1021
- if (s_type_i != s_type_i_ref) {
1022
- LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
1023
- return false;
1024
- }
1025
-
1026
- // Read row size of value
1027
- uint64_t s_size_row_ref;
1028
- io.read_to(&s_size_row_ref, sizeof(s_size_row_ref));
1029
- const size_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
1030
- if (s_size_row != s_size_row_ref) {
1031
- LLAMA_LOG_ERROR("%s: mismatched s row size (%zu != %zu, layer %d)\n", __func__, s_size_row, (size_t) s_size_row_ref, il);
1032
- return false;
1033
- }
1034
-
1035
- if (cell_count) {
1036
- // Read and set the values for the whole cell range
1037
- ggml_backend_tensor_set(s_l[il], io.read(cell_count * s_size_row), head * s_size_row, cell_count * s_size_row);
1038
- }
1039
- }
1040
- } else {
1041
- // For each layer, read the values for each cell (transposed)
1042
- for (uint32_t il = 0; il < n_layer; ++il) {
1043
- // skip null layers
1044
- if (s_l[il] == nullptr) continue;
1045
-
1046
- const uint32_t n_embd_s = hparams.n_embd_s();
1047
-
1048
- // Read type of value
1049
- int32_t s_type_i_ref;
1050
- io.read_to(&s_type_i_ref, sizeof(s_type_i_ref));
1051
- const int32_t s_type_i = (int32_t)s_l[il]->type;
1052
- if (s_type_i != s_type_i_ref) {
1053
- LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
1054
- return false;
1055
- }
1056
-
1057
- // Read element size of value
1058
- uint32_t s_size_el_ref;
1059
- io.read_to(&s_size_el_ref, sizeof(s_size_el_ref));
1060
- const size_t s_size_el = ggml_type_size(s_l[il]->type);
1061
- if (s_size_el != s_size_el_ref) {
1062
- LLAMA_LOG_ERROR("%s: mismatched s element size (%zu != %zu, layer %d)\n", __func__, s_size_el, (size_t) s_size_el_ref, il);
1063
- return false;
1064
- }
1065
-
1066
- // Read state embedding size
1067
- uint32_t n_embd_s_ref;
1068
- io.read_to(&n_embd_s_ref, sizeof(n_embd_s_ref));
1069
- if (n_embd_s != n_embd_s_ref) {
1070
- LLAMA_LOG_ERROR("%s: mismatched s embedding size (%u != %u, layer %d)\n", __func__, n_embd_s, n_embd_s_ref, il);
1071
- return false;
1072
- }
1073
-
1074
- if (cell_count) {
1075
- // For each row in the transposed matrix, read the values for the whole cell range
1076
- for (uint32_t j = 0; j < n_embd_s; ++j) {
1077
- const size_t dst_offset = (head + j * size) * s_size_el;
1078
- ggml_backend_tensor_set(s_l[il], io.read(cell_count * s_size_el), dst_offset, cell_count * s_size_el);
1079
- }
1080
- }
1081
- }
1082
- }
1083
-
1084
- return true;
1085
- }
1086
-
1087
- //
1088
- // llama_memory_recurrent_context
1089
- //
1090
-
1091
- llama_memory_recurrent_context::llama_memory_recurrent_context(llama_memory_status status) : status(status) {}
1092
-
1093
- llama_memory_recurrent_context::llama_memory_recurrent_context(
1094
- llama_memory_recurrent * mem) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), is_full(true) {
1095
- }
1096
-
1097
- llama_memory_recurrent_context::llama_memory_recurrent_context(
1098
- llama_memory_recurrent * mem,
1099
- std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), ubatches(std::move(ubatches)) {}
1100
-
1101
- llama_memory_recurrent_context::~llama_memory_recurrent_context() = default;
1102
-
1103
- bool llama_memory_recurrent_context::next() {
1104
- assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
1105
-
1106
- if (++i_next >= ubatches.size()) {
1107
- return false;
1108
- }
1109
-
1110
- return true;
1111
- }
1112
-
1113
- bool llama_memory_recurrent_context::apply() {
1114
- assert(!llama_memory_status_is_fail(status));
1115
-
1116
- // no ubatches -> this is an update
1117
- if (ubatches.empty()) {
1118
- // recurrent cache never performs updates
1119
- assert(status == LLAMA_MEMORY_STATUS_NO_UPDATE);
1120
-
1121
- return true;
1122
- }
1123
-
1124
- mem->find_slot(ubatches[i_next]);
1125
-
1126
- return true;
1127
- }
1128
-
1129
- llama_memory_status llama_memory_recurrent_context::get_status() const {
1130
- return status;
1131
- }
1132
-
1133
- const llama_ubatch & llama_memory_recurrent_context::get_ubatch() const {
1134
- assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
1135
-
1136
- return ubatches[i_next];
1137
- }
1138
-
1139
- uint32_t llama_memory_recurrent_context::get_n_rs() const {
1140
- return is_full ? mem->size : mem->n;
1141
- }
1142
-
1143
- uint32_t llama_memory_recurrent_context::get_head() const {
1144
- return is_full ? 0 : mem->head;
1145
- }
1146
-
1147
- int32_t llama_memory_recurrent_context::get_rs_z() const {
1148
- return is_full ? 0 : mem->rs_z;
1149
- }
1150
-
1151
- uint32_t llama_memory_recurrent_context::get_size() const {
1152
- return mem->size;
1153
- }
1154
-
1155
- ggml_tensor * llama_memory_recurrent_context::get_r_l(int32_t il) const {
1156
- return mem->r_l[il];
1157
- }
1158
-
1159
- ggml_tensor * llama_memory_recurrent_context::get_s_l(int32_t il) const {
1160
- return mem->s_l[il];
1161
- }
1162
-
1163
- int32_t llama_memory_recurrent_context::s_copy(int i) const {
1164
- return mem->cells[i + mem->head].src0;
1165
- }