whispercpp 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (664)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +59 -27
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/build-xcframework.sh +24 -0
  19. data/ext/sources/examples/CMakeLists.txt +1 -0
  20. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  21. data/ext/sources/examples/addon.node/addon.cpp +154 -35
  22. data/ext/sources/examples/addon.node/index.js +10 -5
  23. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  24. data/ext/sources/examples/bench/bench.cpp +29 -18
  25. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  26. data/ext/sources/examples/cli/cli.cpp +7 -4
  27. data/ext/sources/examples/command/command.cpp +58 -32
  28. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/common-whisper.cpp +14 -7
  31. data/ext/sources/examples/lsp/lsp.cpp +21 -17
  32. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  33. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  34. data/ext/sources/examples/server/server.cpp +193 -35
  35. data/ext/sources/examples/server.py +6 -1
  36. data/ext/sources/examples/stream/stream.cpp +10 -2
  37. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  38. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  39. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
  40. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  41. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  42. data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
  43. data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
  44. data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
  45. data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
  46. data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
  47. data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
  48. data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
  49. data/ext/sources/examples/talk-llama/llama-context.h +68 -32
  50. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  52. data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
  53. data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
  54. data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
  55. data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
  56. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  57. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  58. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
  59. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
  60. data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
  61. data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
  62. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
  63. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
  64. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
  65. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
  66. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  67. data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
  68. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  69. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
  70. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  71. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  72. data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
  73. data/ext/sources/examples/talk-llama/llama-model.h +87 -9
  74. data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
  75. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  76. data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
  77. data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
  78. data/ext/sources/examples/talk-llama/llama.cpp +76 -17
  79. data/ext/sources/examples/talk-llama/llama.h +176 -151
  80. data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
  81. data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
  82. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  83. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  84. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
  85. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  86. data/ext/sources/ggml/CMakeLists.txt +106 -33
  87. data/ext/sources/ggml/cmake/common.cmake +24 -0
  88. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  89. data/ext/sources/ggml/include/ggml-backend.h +18 -2
  90. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  91. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  92. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  93. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  94. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  95. data/ext/sources/ggml/include/ggml.h +365 -21
  96. data/ext/sources/ggml/src/CMakeLists.txt +98 -25
  97. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  98. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  99. data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
  100. data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
  101. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
  102. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  103. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
  104. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  105. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  106. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  107. data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
  108. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
  109. data/ext/sources/ggml/src/ggml-common.h +21 -0
  110. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
  111. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
  112. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  113. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  114. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
  115. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
  116. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
  117. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  118. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  119. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
  120. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  121. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  122. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  123. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  124. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  125. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
  126. data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
  127. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
  128. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
  129. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
  130. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  131. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  132. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  133. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
  134. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
  135. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  136. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
  137. data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
  138. data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
  139. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
  140. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
  141. data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
  142. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
  143. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  144. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  145. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  146. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  147. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
  148. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
  149. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
  150. data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
  151. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  152. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  153. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  154. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  155. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  156. data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
  157. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  158. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  159. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  160. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  161. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  162. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  163. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  164. data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
  165. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
  166. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  167. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  168. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  169. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  170. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
  171. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
  172. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  173. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  174. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  175. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
  176. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  177. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  178. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  179. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
  180. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  181. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  182. data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
  183. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  184. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  185. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  186. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  187. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  188. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  189. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
  190. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
  191. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  192. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  193. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  195. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  196. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  197. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  198. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  199. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  200. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  201. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  202. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  203. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  204. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  205. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  206. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  208. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  210. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  211. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
  212. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  213. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
  214. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  234. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  235. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  236. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  237. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  238. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  239. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  240. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  241. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  242. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  243. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  244. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  245. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  246. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  247. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  248. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  249. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  251. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  252. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  254. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  255. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  259. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  260. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  262. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  270. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  271. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  272. data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
  274. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  277. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  278. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  279. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
  280. data/ext/sources/ggml/src/ggml-impl.h +229 -175
  281. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
  282. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  283. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  284. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  285. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  286. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  287. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  288. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  289. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
  290. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  291. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  292. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  293. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
  294. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  295. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  296. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
  297. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  344. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  345. data/ext/sources/ggml/src/ggml-quants.c +117 -24
  346. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  347. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
  348. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  349. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  350. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
  351. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  352. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  353. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
  354. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
  355. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
  356. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  357. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  358. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
  359. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
  360. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
  361. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
  362. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
  363. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  364. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  365. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  366. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
  367. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
  368. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  369. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  370. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  371. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  372. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
  373. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
  374. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  375. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  401. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  402. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  403. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  404. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
  449. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  450. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  451. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  452. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  453. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  454. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  455. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  456. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  457. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  458. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  459. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  460. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  461. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  462. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  463. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  464. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  465. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  466. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  467. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  468. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  469. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  470. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  471. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  472. data/ext/sources/ggml/src/ggml.c +802 -142
  473. data/ext/sources/ggml/src/ggml.cpp +26 -0
  474. data/ext/sources/ggml/src/gguf.cpp +32 -4
  475. data/ext/sources/include/whisper.h +2 -0
  476. data/ext/sources/src/CMakeLists.txt +2 -0
  477. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  478. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  479. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  480. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  481. data/ext/sources/src/whisper.cpp +241 -215
  482. data/ext/sources/tests/CMakeLists.txt +8 -1
  483. data/ext/sources/tests/test-vad-full.cpp +3 -3
  484. data/ext/sources/tests/test-vad.cpp +2 -2
  485. data/extsources.rb +15 -9
  486. data/lib/whisper/context.rb +15 -0
  487. data/lib/whisper/model/uri.rb +57 -2
  488. data/lib/whisper/segment.rb +58 -0
  489. data/sig/whisper.rbs +75 -38
  490. data/{tests → test}/helper.rb +1 -12
  491. data/{tests → test}/test_model.rb +9 -0
  492. data/test/test_package.rb +51 -0
  493. data/{tests → test}/test_params.rb +8 -0
  494. data/test/test_segment.rb +146 -0
  495. data/{tests → test}/test_whisper.rb +70 -0
  496. data/whispercpp.gemspec +2 -3
  497. metadata +246 -191
  498. data/ext/sources/.dockerignore +0 -3
  499. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  500. data/ext/sources/ci/run.sh +0 -336
  501. data/ext/sources/close-issue.yml +0 -28
  502. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  503. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  504. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  505. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  506. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  507. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  508. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  509. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  510. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  511. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  512. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  513. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  514. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  515. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  516. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  517. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  518. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
  519. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  520. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  521. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  522. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  523. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  524. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  525. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  526. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  527. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  548. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  549. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  550. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  551. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  552. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  553. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  554. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  555. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  556. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  557. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  558. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  559. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  560. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  561. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  562. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  563. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  564. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  565. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  566. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  567. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  568. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  569. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  570. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  571. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  572. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  573. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  574. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  575. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  576. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  577. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  578. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  579. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  580. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  581. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  582. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  583. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  584. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  585. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  586. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  587. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  588. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  589. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  590. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  591. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  592. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  593. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  594. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  595. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  596. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  597. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  598. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  599. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  600. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  601. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  602. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  603. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  604. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  605. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  606. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  607. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  608. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  609. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  610. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  611. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  612. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  613. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  614. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  615. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  616. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  617. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  618. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  619. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  620. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  621. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  622. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  623. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  624. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  625. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  626. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  627. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  628. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  629. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  630. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  631. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  632. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  633. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  634. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  635. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  636. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  637. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  638. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  639. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  640. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  641. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  642. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  643. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  644. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  645. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  646. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  647. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  648. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  649. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  650. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  651. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  652. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  653. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
  654. data/tests/test_package.rb +0 -46
  655. data/tests/test_segment.rb +0 -74
  656. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  657. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  658. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  659. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  660. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  661. /data/{tests → test}/test_callback.rb +0 -0
  662. /data/{tests → test}/test_error.rb +0 -0
  663. /data/{tests → test}/test_vad.rb +0 -0
  664. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -0,0 +1,1982 @@
1
+ #define GGML_COMMON_IMPL_CPP
2
+ #define GGML_COMMON_DECL_CPP
3
+ #include "ggml-common.h"
4
+ #include "ggml-backend-impl.h"
5
+
6
+ #include "ggml-impl.h"
7
+ #include "ggml-cpu.h"
8
+ #include "ggml-cpu-impl.h"
9
+ #include "simd-mappings.h"
10
+ #include "traits.h"
11
+
12
+ #include "arch-fallback.h"
13
+
14
+ #include <cmath>
15
+ #include <cstring>
16
+ #include <cassert>
17
+ #include <cstdio> // for GGML_ASSERT
18
+
19
+ #include "repack.h"
20
+
21
+ #if defined(__GNUC__)
22
+ #pragma GCC diagnostic ignored "-Woverlength-strings"
23
+ #endif
24
+
25
+ #define UNUSED GGML_UNUSED
26
+
27
// Fast float -> int rounding for |fval| <= 2^22 - 1.
// Adding the magic constant 1.5 * 2^23 makes the FPU's round-to-nearest place
// the integer result in the low mantissa bits; the bias subtraction recovers
// the signed value. Ties round to even (FPU default), unlike roundf().
static inline int nearest_int(float fval) {
    assert(fabsf(fval) <= 4194303.f);
    const float shifted = fval + 12582912.f; // 12582912 = 1.5 * 2^23
    int bits;
    memcpy(&bits, &shifted, sizeof(bits));
    return (bits & 0x007fffff) - 0x00400000;
}
33
+
34
+ // Functions to create the interleaved data layout formats
35
+
36
+ // interleave 4 block_q4_0s in blocks of blck_size_interleave
37
+ // returns an interleaved block_q4_0x4
38
+ // in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
39
+ // first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
40
+ //
41
+ // - in : an array of block_q4_0 pointers
42
+ // - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
43
+ // blck_size_interleave bytes
44
+ // - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
45
+ // from bias offset form to pure sign form (this saves subtract
46
+ // operations during unpacking)
47
+ //
48
+
49
+ extern "C" {
50
+
51
+ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
52
+ assert(QK8_0 == 32);
53
+ assert(k % QK8_0 == 0);
54
+ const int nb = k / QK8_0;
55
+
56
+ block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
57
+
58
+ // scalar
59
+ const int blck_size_interleave = 4;
60
+ float srcv[4][QK8_0];
61
+ float id[4];
62
+
63
+ for (int i = 0; i < nb; i++) {
64
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
65
+ float amax = 0.0f; // absolute max
66
+
67
+ for (int j = 0; j < QK8_0; j++) {
68
+ srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
69
+ amax = MAX(amax, fabsf(srcv[row_iter][j]));
70
+ }
71
+
72
+ const float d = amax / ((1 << 7) - 1);
73
+ id[row_iter] = d ? 1.0f / d : 0.0f;
74
+
75
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
76
+ }
77
+
78
+ for (int j = 0; j < QK8_0 * 4; j++) {
79
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
80
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
81
+ src_offset += (j % blck_size_interleave);
82
+
83
+ float x0 = srcv[src_id][src_offset] * id[src_id];
84
+ y[i].qs[j] = roundf(x0);
85
+ }
86
+ }
87
+ }
88
+
89
+ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
90
+ assert(QK8_0 == 32);
91
+ assert(k % QK8_0 == 0);
92
+ const int nb = k / QK8_0;
93
+
94
+ block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
95
+
96
+ // scalar
97
+ const int blck_size_interleave = 8;
98
+ float srcv[4][QK8_0];
99
+ float id[4];
100
+
101
+ for (int i = 0; i < nb; i++) {
102
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
103
+ float amax = 0.0f; // absolute max
104
+
105
+ for (int j = 0; j < QK8_0; j++) {
106
+ srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
107
+ amax = MAX(amax, fabsf(srcv[row_iter][j]));
108
+ }
109
+
110
+ const float d = amax / ((1 << 7) - 1);
111
+ id[row_iter] = d ? 1.0f / d : 0.0f;
112
+
113
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
114
+ }
115
+
116
+ for (int j = 0; j < QK8_0 * 4; j++) {
117
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
118
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
119
+ src_offset += (j % blck_size_interleave);
120
+
121
+ float x0 = srcv[src_id][src_offset] * id[src_id];
122
+ y[i].qs[j] = roundf(x0);
123
+ }
124
+ }
125
+ }
126
+
127
+ void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
128
+ assert(QK_K == 256);
129
+ assert(k % QK_K == 0);
130
+ const int nb = k / QK_K;
131
+
132
+ block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
133
+
134
+ // scalar
135
+ const int blck_size_interleave = 8;
136
+ float srcv[4][QK_K];
137
+ float iscale[4];
138
+
139
+ for (int i = 0; i < nb; i++) {
140
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
141
+ float amax = 0.0f; // absolute max
142
+ float max = 0;
143
+
144
+ for (int j = 0; j < QK_K; j++) {
145
+ srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
146
+ // Update the maximum value of the corresponding super block
147
+ if(amax < fabsf(srcv[row_iter][j])) {
148
+ amax = fabsf(srcv[row_iter][j]);
149
+ max = srcv[row_iter][j];
150
+ }
151
+ }
152
+
153
+ iscale[row_iter] = amax ? -127.f/max : 0;
154
+
155
+ y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
156
+ }
157
+
158
+ for (int j = 0; j < QK_K / 4; j++) {
159
+ y[i].bsums[j] = 0;
160
+ }
161
+
162
+ // Quants values are interleaved in sequence of eight bytes from corresponding super blocks
163
+ // Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving
164
+ // i.e first four bsums from the first super block, followed by first four bsums from second super block and so on
165
+ for (int j = 0; j < QK_K * 4; j++) {
166
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
167
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
168
+ src_offset += (j % blck_size_interleave);
169
+ int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
170
+
171
+ float x0 = srcv[src_id][src_offset] * iscale[src_id];
172
+ y[i].qs[j] = nearest_int(x0);
173
+ y[i].bsums[index] += y[i].qs[j];
174
+ }
175
+ }
176
+ }
177
+
178
+ } // extern "C"
179
+
180
+ template <int64_t INTER_SIZE, ggml_type PARAM_TYPE>
181
+ void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row);
182
+
183
+ template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
184
+ assert(nrow == 4);
185
+ UNUSED(nrow);
186
+ ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row);
187
+ }
188
+
189
+ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
190
+ assert(nrow == 4);
191
+ UNUSED(nrow);
192
+ ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
193
+ }
194
+
195
+ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
196
+ assert(nrow == 4);
197
+ UNUSED(nrow);
198
+ ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
199
+ }
200
+
201
+ extern "C" {
202
+
203
+ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
204
+ const int qk = QK8_0;
205
+ const int nb = n / qk;
206
+ const int ncols_interleaved = 4;
207
+ const int blocklen = 4;
208
+
209
+ assert(nr == 1);
210
+ assert(n % qk == 0);
211
+ assert(nc % ncols_interleaved == 0);
212
+
213
+ UNUSED(s);
214
+ UNUSED(bs);
215
+ UNUSED(vx);
216
+ UNUSED(vy);
217
+ UNUSED(nr);
218
+ UNUSED(nc);
219
+ UNUSED(nb);
220
+ UNUSED(ncols_interleaved);
221
+ UNUSED(blocklen);
222
+
223
+ float sumf[4];
224
+ int sumi;
225
+
226
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
227
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
228
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
229
+
230
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
231
+ for (int l = 0; l < nb; l++) {
232
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
233
+ for (int j = 0; j < ncols_interleaved; j++) {
234
+ sumi = 0;
235
+ for (int i = 0; i < blocklen; ++i) {
236
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
237
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
238
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
239
+ }
240
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
241
+ }
242
+ }
243
+ }
244
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
245
+ }
246
+ }
247
+
248
+ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
249
+ const int qk = QK8_0;
250
+ const int nb = n / qk;
251
+ const int ncols_interleaved = 4;
252
+ const int blocklen = 8;
253
+
254
+ assert (n % qk == 0);
255
+ assert (nc % ncols_interleaved == 0);
256
+
257
+ UNUSED(s);
258
+ UNUSED(bs);
259
+ UNUSED(vx);
260
+ UNUSED(vy);
261
+ UNUSED(nr);
262
+ UNUSED(nc);
263
+ UNUSED(nb);
264
+ UNUSED(ncols_interleaved);
265
+ UNUSED(blocklen);
266
+
267
+ float sumf[4];
268
+ int sumi;
269
+
270
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
271
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
272
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
273
+
274
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
275
+ for (int l = 0; l < nb; l++) {
276
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
277
+ for (int j = 0; j < ncols_interleaved; j++) {
278
+ sumi = 0;
279
+ for (int i = 0; i < blocklen; ++i) {
280
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
281
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
282
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
283
+ }
284
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
285
+ }
286
+ }
287
+ }
288
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
289
+ }
290
+ }
291
+
292
+ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
293
+ const int qk = QK8_0;
294
+ const int nb = n / qk;
295
+ const int ncols_interleaved = 8;
296
+ const int blocklen = 8;
297
+
298
+ assert (n % qk == 0);
299
+ assert (nc % ncols_interleaved == 0);
300
+
301
+ UNUSED(s);
302
+ UNUSED(bs);
303
+ UNUSED(vx);
304
+ UNUSED(vy);
305
+ UNUSED(nr);
306
+ UNUSED(nc);
307
+ UNUSED(nb);
308
+ UNUSED(ncols_interleaved);
309
+ UNUSED(blocklen);
310
+
311
+ float sumf[8];
312
+ int sumi;
313
+
314
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
315
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
316
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
317
+
318
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
319
+ for (int l = 0; l < nb; l++) {
320
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
321
+ for (int j = 0; j < ncols_interleaved; j++) {
322
+ sumi = 0;
323
+ for (int i = 0; i < blocklen; ++i) {
324
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
325
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
326
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
327
+ }
328
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
329
+ }
330
+ }
331
+ }
332
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
333
+ }
334
+ }
335
+
336
// Reference (non-SIMD) GEMV: one q8_K-quantized activation row times a Q4_K
// weight matrix whose rows are interleaved 8 at a time (block_q4_Kx8).
//
//   n  - row length in elements (must be a multiple of QK_K)
//   s  - output vector, nc floats
//   bs - output row stride (unused: single output row)
//   vx - interleaved Q4_K weights (block_q4_Kx8 *)
//   vy - quantized activations (block_q8_K *)
//   nr - number of activation rows (gemv: effectively 1)
//   nc - number of output columns (multiple of 8)
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;              // super-blocks per row
    const int ncols_interleaved = 8;    // weight rows packed per block_q4_Kx8
    const int blocklen = 8;             // bytes interleaved per row per chunk
    // Bit masks for unpacking the 6-bit packed Q4_K scales/mins (standard
    // Q4_K "kmask" shuffle, operating on 4 bytes at a time).
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    // Kept so the signature/locals mirror the SIMD variants, where some of
    // these are conditionally unused.
    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[8];       // per-column dot-product accumulators
    float sum_minf[8];   // per-column minimum corrections (subtracted at the end)
    uint32_t utmp[32];   // unpacked scales/mins: 8 groups x 16 bytes
    int sumi1;
    int sumi2;
    int sumi;

    const block_q8_K * a_ptr = (const block_q8_K *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) {
            sumf[j] = 0.0;
            sum_minf[j] = 0.0;
        }
        for (int l = 0; l < nb; l++) {
            // Unpack the 12-byte (6-bit) packed scales+mins of each of the 8
            // interleaved sub-block groups into 16 bytes of utmp: bytes 0..7
            // become the 8 scales, bytes 8..15 the 8 mins.
            for (int sb = 0; sb < 8; sb++) {
                memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
                utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
                const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
                utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
                utmp[sb * 4 + 2] = uaux_0;
                utmp[sb * 4 + 0] &= kmask1;
            }
            // Each k processes one 8-byte chunk per interleaved column; the
            // low nibbles belong to one 32-element sub-block, the high
            // nibbles to the sub-block 32 elements later.
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;        // scales for the low-nibble sub-block
                uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;   // scales for the high-nibble sub-block
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi1 = 0;
                    sumi2 = 0;
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
                        sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
                        sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
                        sumi1 = sumi1 * scales_0[j];
                        sumi2 = sumi2 * scales_1[j];
                        sumi += sumi1 + sumi2;
                    }
                    // a_ptr[l].d is applied without an FP16 conversion here
                    // (unlike the weight delta) — q8_K stores its delta as float.
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
                }
            }
            // Apply the Q4_K minimum correction: mins live at byte offset 8
            // of each unpacked 16-byte group; bsums holds the activation
            // sub-block sums.
            for (int sb = 0; sb < 8; sb++) {
                uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
                for (int j = 0; j < ncols_interleaved; j++) {
                    sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) {
            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
        }
    }
}
413
+
414
+ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
415
+ const int qk = QK_K;
416
+ const int nb = n / qk;
417
+ const int ncols_interleaved = 8;
418
+ const int blocklen = 8;
419
+
420
+ assert (n % qk == 0);
421
+ assert (nc % ncols_interleaved == 0);
422
+
423
+ UNUSED(s);
424
+ UNUSED(bs);
425
+ UNUSED(vx);
426
+ UNUSED(vy);
427
+ UNUSED(nr);
428
+ UNUSED(nc);
429
+ UNUSED(nb);
430
+ UNUSED(ncols_interleaved);
431
+ UNUSED(blocklen);
432
+
433
+ float sumf[8];
434
+ float sum_minf[8];
435
+ int sumi1,sumi2,sumi3,sumi4;
436
+ int sumi;
437
+
438
+ const block_q8_K * a_ptr = (const block_q8_K *)vy;
439
+ for(int x = 0; x < nc / ncols_interleaved; x++) {
440
+ const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
441
+ for (int j = 0; j < ncols_interleaved; j++) {
442
+ sumf[j] = 0.0;
443
+ sum_minf[j] = 0.0;
444
+ }
445
+ for (int l = 0; l < nb; l++) {
446
+ for (int k = 0; k < (qk / (4 * blocklen)); k++) {
447
+ const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
448
+ const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
449
+ const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
450
+ const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
451
+ for (int j = 0; j < ncols_interleaved; j++) {
452
+ sumi1 = 0;
453
+ sumi2 = 0;
454
+ sumi3 = 0;
455
+ sumi4 = 0;
456
+ sumi = 0;
457
+ int offset = ((k / 2) % 2) + j * 2;
458
+ for (int i = 0; i < blocklen; ++i){
459
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
460
+ const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
461
+ const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
462
+ const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
463
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
464
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
465
+ sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
466
+ sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
467
+
468
+ sumi1 = sumi1 * (scales_0[offset] & 0xF);
469
+ sumi2 = sumi2 * (scales_1[offset] & 0xF);
470
+ sumi3 = sumi3 * (scales_2[offset] & 0xF);
471
+ sumi4 = sumi4 * (scales_3[offset] & 0xF);
472
+ sumi += sumi1 + sumi2 + sumi3 + sumi4;
473
+ }
474
+ sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
475
+ }
476
+ }
477
+ for(int sb = 0; sb < 8; sb++) {
478
+ const uint8_t *mins = b_ptr[l].scales + sb * 16;
479
+ for(int j = 0; j < ncols_interleaved; j++){
480
+ sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
481
+ }
482
+ }
483
+ }
484
+ for (int j = 0; j < ncols_interleaved; j++) {
485
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
486
+ }
487
+ }
488
+ }
489
+
490
+ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
491
+ const int qk = QK8_0;
492
+ const int nb = n / qk;
493
+ const int ncols_interleaved = 4;
494
+ const int blocklen = 4;
495
+
496
+ assert(nr == 1);
497
+ assert(n % qk == 0);
498
+ assert(nc % ncols_interleaved == 0);
499
+
500
+ UNUSED(bs);
501
+ UNUSED(nr);
502
+
503
+ float sumf[4];
504
+ int sumi;
505
+
506
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
507
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
508
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
509
+
510
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
511
+ for (int l = 0; l < nb; l++) {
512
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
513
+ for (int j = 0; j < ncols_interleaved; j++) {
514
+ sumi = 0;
515
+ for (int i = 0; i < blocklen; ++i) {
516
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
517
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
518
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
519
+ }
520
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
521
+ }
522
+ }
523
+ }
524
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
525
+ }
526
+ }
527
+
528
+ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
529
+ const int qk = QK8_0;
530
+ const int nb = n / qk;
531
+ const int ncols_interleaved = 8;
532
+ const int blocklen = 8;
533
+
534
+ assert(nr == 1);
535
+ assert(n % qk == 0);
536
+ assert(nc % ncols_interleaved == 0);
537
+
538
+ UNUSED(bs);
539
+ UNUSED(nr);
540
+
541
+ float sumf[8];
542
+ int sumi;
543
+
544
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
545
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
546
+ const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
547
+
548
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
549
+ for (int l = 0; l < nb; l++) {
550
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
551
+ for (int j = 0; j < ncols_interleaved; j++) {
552
+ sumi = 0;
553
+ for (int i = 0; i < blocklen; ++i) {
554
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
555
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
556
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
557
+ }
558
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
559
+ }
560
+ }
561
+ }
562
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
563
+ }
564
+ }
565
+
566
+ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
567
+ const int qk = QK8_0;
568
+ const int nb = n / qk;
569
+ const int ncols_interleaved = 4;
570
+ const int blocklen = 4;
571
+
572
+ assert (n % qk == 0);
573
+ assert (nr % 4 == 0);
574
+ assert (nc % ncols_interleaved == 0);
575
+
576
+ UNUSED(s);
577
+ UNUSED(bs);
578
+ UNUSED(vx);
579
+ UNUSED(vy);
580
+ UNUSED(nr);
581
+ UNUSED(nc);
582
+ UNUSED(nb);
583
+ UNUSED(ncols_interleaved);
584
+ UNUSED(blocklen);
585
+
586
+ {
587
+ float sumf[4][4];
588
+ int sumi;
589
+
590
+ for (int y = 0; y < nr / 4; y++) {
591
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
592
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
593
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
594
+ for (int m = 0; m < 4; m++) {
595
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
596
+ }
597
+ for (int l = 0; l < nb; l++) {
598
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
599
+ for (int m = 0; m < 4; m++) {
600
+ for (int j = 0; j < ncols_interleaved; j++) {
601
+ sumi = 0;
602
+ for (int i = 0; i < blocklen; ++i) {
603
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
604
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
605
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
606
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
607
+ }
608
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
609
+ }
610
+ }
611
+ }
612
+ }
613
+ for (int m = 0; m < 4; m++) {
614
+ for (int j = 0; j < ncols_interleaved; j++)
615
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
616
+ }
617
+ }
618
+ }
619
+ }
620
+ }
621
+
622
// Reference (non-SIMD) GEMM: 4 q8_0 activation rows (block_q8_0x4) times
// Q4_0 weights interleaved 4 columns at a time with an 8-byte interleave
// stride (block_q4_0x4 repacked with blocklen 8). Produces a 4x4 output
// tile per (row group, column group) pair; bs is the output row stride.
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;              // blocks per row
    const int ncols_interleaved = 4;    // weight rows packed per block_q4_0x4
    const int blocklen = 8;             // bytes interleaved per row per chunk

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    // Kept so the signature/locals mirror the SIMD variants, where some of
    // these are conditionally unused.
    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][4];   // 4x4 output tile accumulator
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                // Sign-extend both nibbles into the high nibble
                                // of an int8; the pair sum is shifted back by 4.
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++)
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
            }
        }
    }
}
675
+
676
// Reference (non-SIMD) GEMM: 4 q8_0 activation rows (block_q8_0x4) times
// Q4_0 weights interleaved 8 columns at a time (block_q4_0x8). Produces a
// 4x8 output tile per (row group, column group) pair; bs is the output row
// stride.
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;              // blocks per row
    const int ncols_interleaved = 8;    // weight rows packed per block_q4_0x8
    const int blocklen = 8;             // bytes interleaved per row per chunk

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    // Kept so the signature/locals mirror the SIMD variants, where some of
    // these are conditionally unused.
    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][8];   // 4x8 output tile accumulator
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                // Sign-extend both nibbles into the high nibble
                                // of an int8; the pair sum is shifted back by 4.
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++)
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
            }
        }
    }
}
729
+
730
// Reference (non-SIMD) GEMM: 4 q8_K activation rows (block_q8_Kx4) times
// Q4_K weights interleaved 8 columns at a time (block_q4_Kx8). Produces a
// 4x8 output tile per (row group, column group) pair; bs is the output row
// stride.
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;              // super-blocks per row
    const int ncols_interleaved = 8;    // weight rows packed per block_q4_Kx8
    const int blocklen = 8;             // bytes interleaved per row per chunk
    // Bit masks for unpacking the 6-bit packed Q4_K scales/mins (standard
    // Q4_K "kmask" shuffle, operating on 4 bytes at a time).
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    // Kept so the signature/locals mirror the SIMD variants, where some of
    // these are conditionally unused.
    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][8];       // 4x8 tile of dot-product accumulators
    float sum_minf[4][8];   // 4x8 tile of minimum corrections
    uint32_t utmp[32];      // unpacked scales/mins: 8 groups x 16 bytes
    int sumi1;
    int sumi2;
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumf[m][j] = 0.0;
                    sum_minf[m][j] = 0.0;
                }
            }
            for (int l = 0; l < nb; l++) {
                // Unpack the 12-byte (6-bit) packed scales+mins of each of the
                // 8 interleaved sub-block groups into 16 bytes of utmp:
                // bytes 0..7 become the scales, bytes 8..15 the mins.
                for (int sb = 0; sb < 8; sb++) {
                    memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
                    utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
                    const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
                    utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
                    utmp[sb * 4 + 2] = uaux_0;
                    utmp[sb * 4 + 0] &= kmask1;
                }
                // Low nibbles belong to one 32-element sub-block, high nibbles
                // to the sub-block 32 elements later; activations are laid out
                // 4 rows interleaved (block_q8_Kx4).
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;        // low-nibble sub-block scales
                    uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;   // high-nibble sub-block scales
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi1 = 0;
                            sumi2 = 0;
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
                                sumi1 = sumi1 * scales_0[j];
                                sumi2 = sumi2 * scales_1[j];
                                sumi += sumi1 + sumi2;
                            }
                            // a_ptr[l].d[m] is applied without an FP16 conversion
                            // (q8_K deltas are stored as float).
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                        }
                    }
                }
                // Minimum correction. The bsums index expression compensates
                // for the interleaved bsums layout of block_q8_Kx4 — offset
                // pattern follows its packing (assumed; matches the activation
                // repack elsewhere in this backend).
                for (int sb = 0; sb < 8; sb++) {
                    uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
                    for (int m = 0; m < 4; m++) {
                        const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
                }
            }
        }
    }
}
818
+
819
+ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
820
+ const int qk = QK_K;
821
+ const int nb = n / qk;
822
+ const int ncols_interleaved = 8;
823
+ const int blocklen = 8;
824
+
825
+ assert (n % qk == 0);
826
+ assert (nr % 4 == 0);
827
+ assert (nc % ncols_interleaved == 0);
828
+
829
+ UNUSED(s);
830
+ UNUSED(bs);
831
+ UNUSED(vx);
832
+ UNUSED(vy);
833
+ UNUSED(nr);
834
+ UNUSED(nc);
835
+ UNUSED(nb);
836
+ UNUSED(ncols_interleaved);
837
+ UNUSED(blocklen);
838
+
839
+ float sumf[4][8];
840
+ float sum_minf[4][8];
841
+ int sumi1, sumi2, sumi3, sumi4;
842
+ int sumi;
843
+
844
+ for (int y = 0; y < nr / 4; y++) {
845
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
846
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
847
+ const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
848
+ for (int m = 0; m < 4; m++) {
849
+ for (int j = 0; j < ncols_interleaved; j++) {
850
+ sumf[m][j] = 0.0;
851
+ sum_minf[m][j] = 0.0;
852
+ }
853
+ }
854
+ for (int l = 0; l < nb; l++) {
855
+ for (int k = 0; k < (qk / (4 * blocklen)); k++) {
856
+
857
+ const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
858
+ const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
859
+ const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
860
+ const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
861
+ for (int m = 0; m < 4; m++) {
862
+ for (int j = 0; j < ncols_interleaved; j++) {
863
+ sumi1 = 0;
864
+ sumi2 = 0;
865
+ sumi3 = 0;
866
+ sumi4 = 0;
867
+ sumi = 0;
868
+ int offset = ((k / 2) % 2) + j * 2;
869
+ for (int i = 0; i < blocklen; ++i){
870
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
871
+ const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
872
+ const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
873
+ const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
874
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
875
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
876
+ sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
877
+ sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
878
+ sumi1 = sumi1 * (scales_0[offset] & 0xF);
879
+ sumi2 = sumi2 * (scales_1[offset] & 0xF);
880
+ sumi3 = sumi3 * (scales_2[offset] & 0xF);
881
+ sumi4 = sumi4 * (scales_3[offset] & 0xF);
882
+ sumi += sumi1 + sumi2 + sumi3 + sumi4;
883
+ }
884
+ sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
885
+ }
886
+ }
887
+ }
888
+ for(int sb = 0; sb < 8; sb++) {
889
+ const uint8_t *mins = b_ptr[l].scales + sb * 16;
890
+ for(int m = 0; m < 4; m++) {
891
+ const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
892
+ for(int j = 0; j < ncols_interleaved; j++) {
893
+ int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
894
+ sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
895
+ }
896
+ }
897
+ }
898
+ }
899
+
900
+ for (int m = 0; m < 4; m++) {
901
+ for (int j = 0; j < ncols_interleaved; j++) {
902
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
903
+ }
904
+ }
905
+ }
906
+ }
907
+ }
908
+
909
+
910
+ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
911
+ const int qk = QK8_0;
912
+ const int nb = n / qk;
913
+ const int ncols_interleaved = 4;
914
+ const int blocklen = 4;
915
+
916
+ assert (n % qk == 0);
917
+ assert (nr % 4 == 0);
918
+ assert (nc % ncols_interleaved == 0);
919
+
920
+ UNUSED(s);
921
+ UNUSED(bs);
922
+ UNUSED(vx);
923
+ UNUSED(vy);
924
+ UNUSED(nr);
925
+ UNUSED(nc);
926
+ UNUSED(nb);
927
+ UNUSED(ncols_interleaved);
928
+ UNUSED(blocklen);
929
+
930
+ {
931
+ float sumf[4][4];
932
+ int sumi;
933
+
934
+ for (int y = 0; y < nr / 4; y++) {
935
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
936
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
937
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
938
+ for (int m = 0; m < 4; m++) {
939
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
940
+ }
941
+ for (int l = 0; l < nb; l++) {
942
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
943
+ for (int m = 0; m < 4; m++) {
944
+ for (int j = 0; j < ncols_interleaved; j++) {
945
+ sumi = 0;
946
+ for (int i = 0; i < blocklen; ++i) {
947
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
948
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
949
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
950
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
951
+ }
952
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
953
+ }
954
+ }
955
+ }
956
+ }
957
+ for (int m = 0; m < 4; m++) {
958
+ for (int j = 0; j < ncols_interleaved; j++)
959
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
960
+ }
961
+ }
962
+ }
963
+ }
964
+ }
965
+
966
// Reference (non-SIMD) GEMM: 4 q8_0 activation rows (block_q8_0x4) times
// IQ4_NL weights interleaved 8 columns at a time (block_iq4_nlx8). Nibbles
// are mapped through the kvalues_iq4nl lookup table; produces a 4x8 output
// tile per (row group, column group) pair; bs is the output row stride.
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;              // blocks per row
    const int ncols_interleaved = 8;    // weight rows packed per block_iq4_nlx8
    const int blocklen = 8;             // bytes interleaved per row per chunk

    assert(n % qk == 0);
    assert(nr % 4 == 0);
    assert(nc % ncols_interleaved == 0);

    float sumf[4][8];   // 4x8 output tile accumulator
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                // Low nibble pairs with the first half of the
                                // block, high nibble with the second half.
                                const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                                const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++)
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
            }
        }
    }
}
1009
+
1010
+ } // extern "C"
1011
+
1012
// Interleave 4 block_q4_0 blocks into one block_q4_0x4: the 4 deltas first,
// then the quants interleaved in runs of blck_size_interleave bytes
// (round-robin over the 4 source blocks). The 0x88.. XOR flips each nibble's
// offset-8 bias into a sign bit so the GEMM kernels can sign-extend directly.
// Only blck_size_interleave of 4 or 8 is supported.
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x4 out;

    for (int i = 0; i < 4; i++) {
        out.d[i] = in[i].d;
    }

    // Total number of interleave chunks across the 4 source blocks.
    const int end = QK4_0 * 2 / blck_size_interleave;

    if (blck_size_interleave == 8) {
        const uint64_t xor_mask = 0x8888888888888888ULL;
        for (int i = 0; i < end; ++i) {
            int src_id = i % 4;                              // source block (round-robin)
            int src_offset = (i / 4) * blck_size_interleave; // position within that block
            int dst_offset = i * blck_size_interleave;

            uint64_t elems;
            // Using memcpy to avoid unaligned memory accesses
            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
            elems ^= xor_mask;
            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
        }
    } else if (blck_size_interleave == 4) {
        const uint32_t xor_mask = 0x88888888;
        for (int i = 0; i < end; ++i) {
            int src_id = i % 4;
            int src_offset = (i / 4) * blck_size_interleave;
            int dst_offset = i * blck_size_interleave;

            uint32_t elems;
            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
            elems ^= xor_mask;
            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
        }
    } else {
        // Unsupported interleave width.
        GGML_ASSERT(false);
    }

    return out;
}
1052
+
1053
+ // interleave 8 block_q4_0s in blocks of blck_size_interleave
1054
+ // returns an interleaved block_q4_0x8
1055
+ // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
1056
+ // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
1057
+ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
1058
+ block_q4_0x8 out;
1059
+
1060
+ for (int i = 0; i < 8; i++) {
1061
+ out.d[i] = in[i].d;
1062
+ }
1063
+
1064
+ const int end = QK4_0 * 4 / blck_size_interleave;
1065
+ const uint64_t xor_mask = 0x8888888888888888ULL;
1066
+
1067
+ for (int i = 0; i < end; ++i) {
1068
+ int src_id = i % 8;
1069
+ int src_offset = (i / 8) * blck_size_interleave;
1070
+ int dst_offset = i * blck_size_interleave;
1071
+
1072
+ uint64_t elems;
1073
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1074
+ elems ^= xor_mask;
1075
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
1076
+ }
1077
+
1078
+ return out;
1079
+ }
1080
+
1081
// Interleave 8 block_q4_K super-blocks into one block_q4_Kx8: copy the 8
// deltas and 8 dmins, interleave the quants 8 bytes at a time (round-robin
// over the 8 sources), then repack the 6-bit scales/mins so that each
// 12-byte group holds the scales and mins of one sub-block position across
// all 8 source blocks.
// NOTE(review): the 8-byte memcpy interleave assumes blck_size_interleave == 8
// — confirm against callers (repack_q4_K_to_q4_K_8_bl asserts this).
static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
    block_q4_Kx8 out;
    //Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
    }

    for (int i = 0; i < 8; i++) {
        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
    }

    // Total number of interleave chunks across the 8 source blocks.
    const int end = QK_K * 4 / blck_size_interleave;

    // Interleave Q4_K quants by taking 8 bytes at a time
    for (int i = 0; i < end; ++i) {
        int src_id = i % 8;                              // source block (round-robin)
        int src_offset = (i / 8) * blck_size_interleave; // position within that block
        int dst_offset = i * blck_size_interleave;

        uint64_t elems;
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }

    // The below logic is designed so as to unpack and rearrange scales and mins values in Q4_K
    // Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
    // The output Q4_Kx8 structure has 96 bytes
    // Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q4_K structure
    // For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q4_K structures
    uint8_t s[8], m[8];

    // Sub-blocks 0..3: the 6-bit values live directly in scales[0..7].
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 8; j++) {
            s[j] = in[j].scales[i] & 63;
            m[j] = in[j].scales[i + 4] & 63;
        }

        // Re-pack 8 six-bit scales + 8 six-bit mins into 12 bytes: low bytes
        // carry the low 6 bits (plus 2 spill bits of s[4..7]/m[4..7]), the
        // last 4 bytes carry the remaining low nibbles.
        out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
        out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
        out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
        out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
        out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
        out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
        out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
        out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
        out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
        out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
        out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
        out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);

    }

    // Sub-blocks 4..7: reassemble the 6-bit values from the split high bits
    // in scales[0..7] and the nibbles in scales[8..11].
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 8; j++) {
            s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
            m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
        }

        out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
        out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
        out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
        out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
        out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
        out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
        out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
        out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
        out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
        out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
        out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
        out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);

    }

    return out;
}
1156
+
1157
// Interleave 8 block_q2_K super-blocks into one block_q2_Kx8: copy the 8
// deltas and 8 dmins, interleave the quants 8 bytes at a time (round-robin
// over the 8 sources), then rearrange the per-sub-block scale/min bytes.
// NOTE(review): the 8-byte memcpy interleave assumes blck_size_interleave == 8
// — confirm against callers.
static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
    block_q2_Kx8 out;

    // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
    }

    for (int i = 0; i < 8; i++) {
        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
    }

    // Total number of interleave chunks across the 8 source blocks.
    const int end = QK_K * 2 / blck_size_interleave;

    // Interleave Q2_K quants by taking 8 bytes at a time
    for (int i = 0; i < end; ++i) {
        int src_id = i % 8;                              // source block (round-robin)
        int src_offset = (i / 8) * blck_size_interleave; // position within that block
        int dst_offset = i * blck_size_interleave;

        uint64_t elems;
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }

    // The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
    // Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
    // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
    // Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
    // For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures

    for (int i = 0; i < 128; i++) {

        // Index for selecting which q2k super block
        int src1 = (i % 16) / 2;
        // Index for selecting scale
        int src2 = ((i / 16) * 2) + (i % 2);

        out.scales[i] = in[src1].scales[src2];
    }
    return out;

}
1200
+
1201
// Repack a Q4_0 tensor in place into the 4-row interleaved layout
// (block_q4_0x4) expected by the 4x4/4x8 GEMV/GEMM kernels above.
//
//   t                - destination tensor (t->data receives the repacked blocks)
//   interleave_block - interleave width passed to make_block_q4_0x4 (4 or 8)
//   data             - source row-major Q4_0 blocks
//   data_size        - size of `data` in bytes (sanity-checked)
//
// Returns 0 on success, -1 if the tensor shape cannot be interleaved
// (rows not a multiple of 4, or row length not a multiple of 8).
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
    constexpr int nrows_interleaved = 4;

    block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
    const block_q4_0 * src = (const block_q4_0 *)data;
    block_q4_0 dst_tmp[4];  // staging area: the 4 source blocks to interleave
    int nrow = ggml_nrows(t);
    int nblocks = t->ne[0] / QK4_0;

    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));

    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
        return -1;
    }

    // Walk the rows 4 at a time; for each block column gather the same block
    // from each of the 4 rows and emit one interleaved block.
    for (int b = 0; b < nrow; b += nrows_interleaved) {
        for (int64_t x = 0; x < nblocks; x++) {
            for (int i = 0; i < nrows_interleaved; i++) {
                dst_tmp[i] = src[x + i * nblocks];
            }
            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
        }
        src += nrows_interleaved * nblocks;
    }
    return 0;

    // data_size is only read by the GGML_ASSERT above, which may compile out.
    GGML_UNUSED(data_size);
}
1231
+ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1232
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
1233
+ GGML_ASSERT(interleave_block == 8);
1234
+ constexpr int nrows_interleaved = 8;
1235
+
1236
+ block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
1237
+ const block_q4_K * src = (const block_q4_K*) data;
1238
+ block_q4_K dst_tmp[8];
1239
+ int nrow = ggml_nrows(t);
1240
+ int nblocks = t->ne[0] / QK_K;
1241
+
1242
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
1243
+
1244
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
1245
+ return -1;
1246
+ }
1247
+
1248
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
1249
+ for (int64_t x = 0; x < nblocks; x++) {
1250
+ for (int i = 0; i < nrows_interleaved; i++ ) {
1251
+ dst_tmp[i] = src[x + i * nblocks];
1252
+ }
1253
+ *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
1254
+ }
1255
+ src += nrows_interleaved * nblocks;
1256
+ }
1257
+ return 0;
1258
+
1259
+ GGML_UNUSED(data_size);
1260
+ }
1261
+
1262
+ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1263
+ GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
1264
+ GGML_ASSERT(interleave_block == 8);
1265
+ constexpr int nrows_interleaved = 8;
1266
+
1267
+ block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
1268
+ const block_q2_K * src = (const block_q2_K*) data;
1269
+ block_q2_K dst_tmp[8];
1270
+ int nrow = ggml_nrows(t);
1271
+ int nblocks = t->ne[0] / QK_K;
1272
+
1273
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
1274
+
1275
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
1276
+ return -1;
1277
+ }
1278
+
1279
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
1280
+ for (int64_t x = 0; x < nblocks; x++) {
1281
+ for (int i = 0; i < nrows_interleaved; i++ ) {
1282
+ dst_tmp[i] = src[x + i * nblocks];
1283
+ }
1284
+ *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
1285
+ }
1286
+ src += nrows_interleaved * nblocks;
1287
+ }
1288
+ return 0;
1289
+
1290
+ GGML_UNUSED(data_size);
1291
+ }
1292
+
1293
+ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1294
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
1295
+ GGML_ASSERT(interleave_block == 8);
1296
+ constexpr int nrows_interleaved = 8;
1297
+
1298
+ block_q4_0x8 * dst = (block_q4_0x8*)t->data;
1299
+ const block_q4_0 * src = (const block_q4_0*) data;
1300
+ block_q4_0 dst_tmp[8];
1301
+ int nrow = ggml_nrows(t);
1302
+ int nblocks = t->ne[0] / QK4_0;
1303
+
1304
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
1305
+
1306
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
1307
+ return -1;
1308
+ }
1309
+
1310
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
1311
+ for (int64_t x = 0; x < nblocks; x++) {
1312
+ for (int i = 0; i < nrows_interleaved; i++ ) {
1313
+ dst_tmp[i] = src[x + i * nblocks];
1314
+ }
1315
+ *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
1316
+ }
1317
+ src += nrows_interleaved * nblocks;
1318
+ }
1319
+ return 0;
1320
+
1321
+ GGML_UNUSED(data_size);
1322
+ }
1323
+
1324
// Pack four block_iq4_nl blocks into one interleaved block_iq4_nlx4.
// blck_size_interleave is the interleave chunk width in bytes; only 4 is supported.
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
    block_iq4_nlx4 out;

    // copy the per-block scale factors of the four inputs
    for (int i = 0; i < 4; i++) {
        out.d[i] = in[i].d;
    }

    // number of interleave chunks: total quant bytes (4 blocks * QK4_NL/2) / chunk width
    const int end = QK4_NL * 2 / blck_size_interleave;

    // TODO: this branch seems wrong
    //if (blck_size_interleave == 8) {
    //    for (int i = 0; i < end; ++i) {
    //        int src_id = i % 4;
    //        int src_offset = (i / 4) * blck_size_interleave;
    //        int dst_offset = i * blck_size_interleave;

    //        // Using memcpy to avoid unaligned memory accesses
    //        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
    //    }
    //} else
    if (blck_size_interleave == 4) {
        for (int i = 0; i < end; ++i) {
            int src_id = i % 4;                              // source block, round-robin over the 4 inputs
            int src_offset = (i / 4) * blck_size_interleave; // chunk offset within the source block
            int dst_offset = i * blck_size_interleave;       // linear offset in the interleaved output

            // memcpy avoids unaligned 32-bit loads/stores
            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
        }
    } else {
        GGML_ASSERT(false);
    }

    return out;
}
1358
+
1359
+ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1360
+ GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
1361
+ GGML_ASSERT(interleave_block == 4);
1362
+
1363
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
1364
+ block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
1365
+
1366
+ block_iq4_nl dst_tmp[4];
1367
+
1368
+ int nrow = ggml_nrows(t);
1369
+ int nrows_interleaved = 4;
1370
+ int nblocks = t->ne[0] / QK4_NL;
1371
+
1372
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
1373
+
1374
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
1375
+ return -1;
1376
+ }
1377
+
1378
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
1379
+ for (int64_t x = 0; x < nblocks; x++) {
1380
+ for (int i = 0; i < nrows_interleaved; i++) {
1381
+ dst_tmp[i] = src[x + i * nblocks];
1382
+ }
1383
+ *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
1384
+ }
1385
+ src += nrows_interleaved * nblocks;
1386
+ }
1387
+ return 0;
1388
+
1389
+ GGML_UNUSED(data_size);
1390
+ }
1391
+
1392
+ static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
1393
+ block_iq4_nlx8 out;
1394
+
1395
+ for (int i = 0; i < 8; i++) {
1396
+ out.d[i] = in[i].d;
1397
+ }
1398
+
1399
+ const int end = QK4_NL * 4 / blck_size_interleave;
1400
+
1401
+ if (blck_size_interleave == 8) {
1402
+ for (int i = 0; i < end; ++i) {
1403
+ int src_id = i % 8;
1404
+ int src_offset = (i / 8) * blck_size_interleave;
1405
+ int dst_offset = i * blck_size_interleave;
1406
+
1407
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
1408
+ }
1409
+ } else {
1410
+ GGML_ASSERT(false);
1411
+ }
1412
+
1413
+ return out;
1414
+ }
1415
+
1416
+ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1417
+ GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
1418
+ GGML_ASSERT(interleave_block == 8);
1419
+
1420
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
1421
+ block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
1422
+
1423
+ block_iq4_nl dst_tmp[8];
1424
+
1425
+ int nrow = ggml_nrows(t);
1426
+ int nrows_interleaved = 8;
1427
+ int nblocks = t->ne[0] / QK4_NL;
1428
+
1429
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
1430
+
1431
+ if (t->ne[1] % nrows_interleaved != 0) {
1432
+ return -1;
1433
+ }
1434
+
1435
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
1436
+ for (int64_t x = 0; x < nblocks; x++) {
1437
+ for (int i = 0; i < nrows_interleaved; i++) {
1438
+ dst_tmp[i] = src[x + i * nblocks];
1439
+ }
1440
+ *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
1441
+ }
1442
+ src += nrows_interleaved * nblocks;
1443
+ }
1444
+ return 0;
1445
+
1446
+ GGML_UNUSED(data_size);
1447
+ }
1448
+
1449
namespace ggml::cpu::repack {
// repack: convert a tensor's data from the reference quant layout into the
// interleaved layout selected by <block type, interleave size, column count>.
// Each specialization forwards to the matching repack_* helper above.
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
int repack(struct ggml_tensor *, const void *, size_t);

// TODO: generalise.
template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
}

template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
}

template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
}

template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
}

template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
}

template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
}

// TODO: needs to be revisited
//template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
//    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
//}

template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
}
1487
+
1488
// gemv: matrix-vector product dispatch for the interleaved layouts.
// Each specialization forwards to the matching hand-optimized kernel;
// PARAM_TYPE is the quantized type the src1 activations were converted to.
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
void gemv(int, float *, size_t, const void *, const void *, int, int);

template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
1519
+
1520
// gemm: matrix-matrix product dispatch for the interleaved layouts
// (used when src1 has 4+ rows). Each specialization forwards to the
// matching hand-optimized kernel.
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
void gemm(int, float *, size_t, const void *, const void *, int, int);

template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
1551
+
1552
// Common interface for the repack tensor traits: extends the generic CPU
// tensor traits with a repack() entry point, invoked when tensor data is
// uploaded into a CPU_REPACK buffer (see ggml_backend_cpu_repack_buffer_set_tensor).
class tensor_traits_base : public ggml::cpu::tensor_traits {
  public:
    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
};
1556
+
1557
// Traits object attached (via tensor->extra) to tensors stored in a CPU_REPACK
// buffer. Implements the MUL_MAT / MUL_MAT_ID forward pass on the interleaved
// layout selected by the template parameters, plus the repack() upload hook.
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {

    // Report the scratch (wdata) size needed to run `op`:
    // space to hold src1 converted to PARAM_TYPE, plus — for MUL_MAT_ID —
    // the expert row-mapping bookkeeping.
    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
        // not really a GGML_TYPE_Q8_0 buffer, but the row size is the same.
        switch (op->op) {
            case GGML_OP_MUL_MAT:
                {
                    size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
                    return true;
                }
            case GGML_OP_MUL_MAT_ID:
                {
                    size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
                    size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.

                    const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
                    const int64_t ne12 = op->src[1]->ne[2]; // n_tokens

                    const size_t sizeof_mmid_row_mapping = sizeof(int64_t);

                    // row counts [n_as] + row mappings [n_as][ne12]
                    size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);

                    return true;
                }
            default:
                // GGML_ABORT("fatal error");
                break;
        }
        return false;
    }

    // Dispatch the supported ops; returns false for anything else so the
    // generic CPU path can handle it.
    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
        switch (op->op) {
            case GGML_OP_MUL_MAT:
                forward_mul_mat(params, op);
                return true;
            case GGML_OP_MUL_MAT_ID:
                forward_mul_mat_id(params, op);
                return true;
            default:
                // GGML_ABORT("fatal error");
                break;
        }
        return false;
    }

    // MUL_MAT: quantize src1 (F32) into PARAM_TYPE in wdata, then multiply
    // against the repacked src0 using gemm for groups of 4 src1 rows and
    // gemv for the remainder.
    void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
        const ggml_tensor * src0 = op->src[0];
        const ggml_tensor * src1 = op->src[1];
        ggml_tensor * dst = op;

        GGML_TENSOR_BINARY_OP_LOCALS

        const int ith = params->ith;
        const int nth = params->nth;

        GGML_ASSERT(ne0 == ne01);
        GGML_ASSERT(ne1 == ne11);
        GGML_ASSERT(ne2 == ne12);
        GGML_ASSERT(ne3 == ne13);

        // dst cannot be transposed or permuted
        GGML_ASSERT(nb0 == sizeof(float));
        GGML_ASSERT(nb0 <= nb1);
        GGML_ASSERT(nb1 <= nb2);
        GGML_ASSERT(nb2 <= nb3);

        GGML_ASSERT(src1->type == GGML_TYPE_F32);

        GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
        //    GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);

        char * wdata = static_cast<char *>(params->wdata);
        const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);

        assert(params->wsize >= nbw1 * ne11);

        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;

        // quantize groups of 4 src1 rows at a time (matches the gemm kernel),
        // rows distributed over threads
        int64_t i11_processed = 0;
        for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
            ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
        }

        // leftover rows (< 4) are quantized one at a time
        i11_processed = ne11 - ne11 % 4;
        for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
            from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
        }

        // all threads must finish quantizing before any reads wdata
        ggml_barrier(params->threadpool);

        const void * src1_wdata = params->wdata;
        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);

        // split src0 rows across threads, rounded up to NB_COLS so each
        // thread owns whole interleaved column groups
        int64_t src0_start = (ith * ne01) / nth;
        int64_t src0_end = ((ith + 1) * ne01) / nth;
        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
        src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
        if (src0_start >= src0_end) {
            return;
        }

        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
        if (ne11 > 3) {
            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                    (float *) ((char *) dst->data) + src0_start, ne01,
                    (const char *) src0->data + src0_start * nb01,
                    (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
        }
        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                    (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
                    (const char *) src0->data + src0_start * nb01,
                    (const char *) src1_wdata + (src1_col_stride * iter), 1,
                    src0_end - src0_start);
        }
    }

    // MUL_MAT_ID (mixture-of-experts): quantize src1, group src1 rows by the
    // expert selected in `ids`, then run gemv per (expert, row) pair.
    void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) {
        const ggml_tensor * src0 = op->src[0];
        const ggml_tensor * src1 = op->src[1];
        const ggml_tensor * ids  = op->src[2];
        ggml_tensor * dst = op;

        GGML_TENSOR_BINARY_OP_LOCALS

        const int ith = params->ith;
        const int nth = params->nth;

        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;

        // we don't support permuted src0 or src1
        GGML_ASSERT(nb00 == ggml_type_size(src0->type));
        GGML_ASSERT(nb10 == ggml_type_size(src1->type));

        // dst cannot be transposed or permuted
        GGML_ASSERT(nb0 == sizeof(float));
        GGML_ASSERT(nb0 <= nb1);
        GGML_ASSERT(nb1 <= nb2);
        GGML_ASSERT(nb2 <= nb3);

        GGML_ASSERT(ne03 == 1);
        GGML_ASSERT(ne13 == 1);
        GGML_ASSERT(ne3 == 1);

        GGML_ASSERT(src1->type == GGML_TYPE_F32);

        // row groups
        const int n_ids = ids->ne[0]; // n_expert_used
        const int n_as  = ne02;       // n_expert

        const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
        const size_t nbw2 = nbw1*ne11;
        const size_t nbw3 = nbw2*ne12;

        struct mmid_row_mapping {
            int32_t i1;
            int32_t i2;
        };

        GGML_ASSERT(params->wsize >=
                (GGML_PAD(nbw3, sizeof(int64_t)) +
                 n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
                );

        // wdata layout: [quantized src1 | row counts | row mappings]
        auto * wdata          = (char *)params->wdata;
        auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));

        // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
        auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
        struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]

        // src1: float32 => param type
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
                from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
                           (void *) (wdata + i12 * nbw2 + i11 * nbw1),
                           ne10);
            }
        }

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]

        if (ith == 0) {
            // initialize matrix_row_counts
            memset(matrix_row_counts, 0, n_as * sizeof(int64_t));

            // group rows by src0 matrix
            for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
                for (int32_t id = 0; id < n_ids; ++id) {
                    const int32_t i02 =
                        *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);

                    GGML_ASSERT(i02 >= 0 && i02 < n_as);

                    MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
                    matrix_row_counts[i02] += 1;
                }
            }
        }

        // wait for quantization and row grouping to complete on all threads
        ggml_barrier(params->threadpool);

        // compute each matrix multiplication in sequence
        for (int cur_a = 0; cur_a < n_as; ++cur_a) {
            const int64_t cne1 = matrix_row_counts[cur_a];

            if (cne1 == 0) {
                continue;
            }

            const auto * src0_cur = (const char *) src0->data + cur_a*nb02;

            //const int64_t nr0 = ne01; // src0 rows
            const int64_t nr1 = cne1; // src1 rows

            // split src0 rows across threads, rounded up to NB_COLS column groups
            int64_t src0_cur_start = (ith * ne01) / nth;
            int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;

            src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
            src0_cur_end   = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;

            if (src0_cur_start >= src0_cur_end) {
                // the per-thread row range does not depend on cur_a, so an
                // empty range here stays empty for every expert — return
                return;
            }

            for (int ir1 = 0; ir1 < nr1; ir1++) {
                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);

                const int id = row_mapping.i1; // selected expert index

                const int64_t i11 = id % ne11;
                const int64_t i12 = row_mapping.i2; // row index in src1

                const int64_t i1 = id;  // selected expert index
                const int64_t i2 = i12; // row

                const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);

                gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                        (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
                        src0_cur + src0_cur_start * nb01,
                        src1_col, 1, src0_cur_end - src0_cur_start);
            }
        }
#undef MMID_MATRIX_ROW
    }

    // Upload hook: converts incoming reference-layout data into the
    // interleaved layout of this trait instance.
    int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
        GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                       (int) NB_COLS, (int) INTER_SIZE);
        return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
    }
};

} // namespace ggml::cpu::repack
1812
+
1813
+ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
1814
+
1815
+ // instance for Q4
1816
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
1817
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
1818
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
1819
+ static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
1820
+
1821
+ // instance for Q2
1822
+ static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
1823
+
1824
+ // instance for IQ4
1825
+ static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
1826
+ static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
1827
+
1828
+ if (cur->type == GGML_TYPE_Q4_0) {
1829
+ if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
1830
+ if (cur->ne[1] % 8 == 0) {
1831
+ return &q4_0_8x8_q8_0;
1832
+ }
1833
+ }
1834
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
1835
+ if (cur->ne[1] % 4 == 0) {
1836
+ return &q4_0_4x8_q8_0;
1837
+ }
1838
+ }
1839
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
1840
+ if (cur->ne[1] % 4 == 0) {
1841
+ return &q4_0_4x4_q8_0;
1842
+ }
1843
+ }
1844
+ } else if (cur->type == GGML_TYPE_Q4_K) {
1845
+ if (ggml_cpu_has_avx2()) {
1846
+ if (cur->ne[1] % 8 == 0) {
1847
+ return &q4_K_8x8_q8_K;
1848
+ }
1849
+ }
1850
+ } else if (cur->type == GGML_TYPE_Q2_K) {
1851
+ if (ggml_cpu_has_avx512()) {
1852
+ if (cur->ne[1] % 8 == 0) {
1853
+ return &q2_K_8x8_q8_K;
1854
+ }
1855
+ }
1856
+ } else if (cur->type == GGML_TYPE_IQ4_NL) {
1857
+ if (ggml_cpu_has_avx2()) {
1858
+ if (cur->ne[1] % 8 == 0) {
1859
+ return &iq4_nl_8x8_q8_0;
1860
+ }
1861
+ }
1862
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
1863
+ if (cur->ne[1] % 4 == 0) {
1864
+ return &iq4_nl_4x4_q8_0;
1865
+ }
1866
+ }
1867
+ }
1868
+
1869
+ return nullptr;
1870
+ }
1871
+
1872
+ static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1873
+ tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor));
1874
+
1875
+ GGML_UNUSED(buffer);
1876
+ return GGML_STATUS_SUCCESS;
1877
+ }
1878
+
1879
+ static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
1880
+ const void * data, size_t offset, size_t size) {
1881
+ GGML_ASSERT(offset == 0);
1882
+ GGML_ASSERT(size == ggml_nbytes(tensor));
1883
+
1884
+ auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra;
1885
+ auto OK = tensor_traits->repack(tensor, data, size);
1886
+
1887
+ GGML_ASSERT(OK == 0);
1888
+ GGML_UNUSED(buffer);
1889
+ }
1890
+
1891
+ static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
1892
+ return "CPU_REPACK";
1893
+
1894
+ GGML_UNUSED(buft);
1895
+ }
1896
+
1897
+ static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1898
+ ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
1899
+
1900
+ if (buffer == nullptr) {
1901
+ return nullptr;
1902
+ }
1903
+
1904
+ buffer->buft = buft;
1905
+ buffer->iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor;
1906
+ buffer->iface.set_tensor = ggml_backend_cpu_repack_buffer_set_tensor;
1907
+ buffer->iface.get_tensor = nullptr;
1908
+ buffer->iface.cpy_tensor = nullptr;
1909
+ return buffer;
1910
+ }
1911
+
1912
+ static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
1913
+ return TENSOR_ALIGNMENT;
1914
+
1915
+ GGML_UNUSED(buft);
1916
+ }
1917
+
1918
+ namespace ggml::cpu::repack {
1919
+ class extra_buffer_type : ggml::cpu::extra_buffer_type {
1920
+ bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
1921
+ if ( op->op == GGML_OP_MUL_MAT &&
1922
+ op->src[0]->buffer &&
1923
+ (ggml_n_dims(op->src[0]) == 2) &&
1924
+ op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() &&
1925
+ ggml_repack_get_optimal_repack_type(op->src[0])
1926
+ ) {
1927
+ if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
1928
+ return false;
1929
+ }
1930
+ if (op->src[1]->type == GGML_TYPE_F32) {
1931
+ return true;
1932
+ }
1933
+ //if (op->src[1]->type == GGML_TYPE_Q8_0) {
1934
+ // return true;
1935
+ //}
1936
+ // may be possible if Q8_0 packed...
1937
+ } else if (op->op == GGML_OP_MUL_MAT_ID
1938
+ && op->src[0]->buffer
1939
+ && (ggml_n_dims(op->src[0]) == 3)
1940
+ && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()
1941
+ && ggml_repack_get_optimal_repack_type(op->src[0])
1942
+ ) {
1943
+ if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
1944
+ return false;
1945
+ }
1946
+ if (op->src[1]->type == GGML_TYPE_F32) {
1947
+ return true;
1948
+ }
1949
+ //if (op->src[1]->type == GGML_TYPE_Q8_0) {
1950
+ // return true;
1951
+ //}
1952
+ }
1953
+ return false;
1954
+ }
1955
+
1956
+ ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
1957
+ if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
1958
+ if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) {
1959
+ return (ggml::cpu::tensor_traits *) op->src[0]->extra;
1960
+ }
1961
+ }
1962
+ return nullptr;
1963
+ }
1964
+ };
1965
+ } // namespace ggml::cpu::repack
1966
+
1967
// Returns the singleton buffer type for CPU tensors stored in a repacked
// (interleaved) layout. Tensors allocated from this buffer type are repacked
// on upload and matched to the hand-optimized gemm/gemv kernels above.
ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_repack = {
        /* .iface    = */ {
            /* .get_name         = */ ggml_backend_cpu_repack_buffer_type_get_name,
            /* .alloc_buffer     = */ ggml_backend_cpu_repack_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_repack_buffer_type_get_alignment,
            /* .get_max_size     = */ nullptr, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ nullptr, // defaults to ggml_nbytes
            /* .is_host          = */ nullptr,
        },
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        // NOTE(review): the context is allocated once and never freed —
        // presumably acceptable for a process-lifetime singleton
        /* .context = */ new ggml::cpu::repack::extra_buffer_type(),
    };

    return &ggml_backend_cpu_buffer_type_repack;
}