@agency-lang/whisper-local 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (643)
  1. package/CMakeLists.txt +51 -0
  2. package/README.md +145 -0
  3. package/build/Release/whisper_addon.node +0 -0
  4. package/dist/src/addon.d.ts +11 -0
  5. package/dist/src/addon.js +22 -0
  6. package/dist/src/cli.d.ts +2 -0
  7. package/dist/src/cli.js +117 -0
  8. package/dist/src/ffmpeg.d.ts +11 -0
  9. package/dist/src/ffmpeg.js +154 -0
  10. package/dist/src/handleCache.d.ts +9 -0
  11. package/dist/src/handleCache.js +83 -0
  12. package/dist/src/modelManager.d.ts +12 -0
  13. package/dist/src/modelManager.js +172 -0
  14. package/dist/src/packageRoot.d.ts +8 -0
  15. package/dist/src/packageRoot.js +21 -0
  16. package/dist/src/transcribe.d.ts +2 -0
  17. package/dist/src/transcribe.js +36 -0
  18. package/dist/src/types.d.ts +11 -0
  19. package/dist/src/types.js +17 -0
  20. package/index.agency +32 -0
  21. package/models.lock.json +55 -0
  22. package/package.json +52 -0
  23. package/vendor/whisper.cpp/CMakeLists.txt +251 -0
  24. package/vendor/whisper.cpp/LICENSE +21 -0
  25. package/vendor/whisper.cpp/UPSTREAM_SHA256 +1 -0
  26. package/vendor/whisper.cpp/VERSION +1 -0
  27. package/vendor/whisper.cpp/cmake/DefaultTargetOptions.cmake +16 -0
  28. package/vendor/whisper.cpp/cmake/FindFFmpeg.cmake +163 -0
  29. package/vendor/whisper.cpp/cmake/build-info.cmake +60 -0
  30. package/vendor/whisper.cpp/cmake/git-vars.cmake +22 -0
  31. package/vendor/whisper.cpp/cmake/whisper-config.cmake.in +65 -0
  32. package/vendor/whisper.cpp/cmake/whisper.pc.in +10 -0
  33. package/vendor/whisper.cpp/ggml/CMakeLists.txt +434 -0
  34. package/vendor/whisper.cpp/ggml/cmake/BuildTypes.cmake +54 -0
  35. package/vendor/whisper.cpp/ggml/cmake/GitVars.cmake +22 -0
  36. package/vendor/whisper.cpp/ggml/cmake/common.cmake +50 -0
  37. package/vendor/whisper.cpp/ggml/cmake/ggml-config.cmake.in +152 -0
  38. package/vendor/whisper.cpp/ggml/include/ggml-alloc.h +76 -0
  39. package/vendor/whisper.cpp/ggml/include/ggml-backend.h +354 -0
  40. package/vendor/whisper.cpp/ggml/include/ggml-blas.h +25 -0
  41. package/vendor/whisper.cpp/ggml/include/ggml-cann.h +123 -0
  42. package/vendor/whisper.cpp/ggml/include/ggml-cpp.h +39 -0
  43. package/vendor/whisper.cpp/ggml/include/ggml-cpu.h +143 -0
  44. package/vendor/whisper.cpp/ggml/include/ggml-cuda.h +47 -0
  45. package/vendor/whisper.cpp/ggml/include/ggml-kompute.h +50 -0
  46. package/vendor/whisper.cpp/ggml/include/ggml-metal.h +66 -0
  47. package/vendor/whisper.cpp/ggml/include/ggml-opencl.h +26 -0
  48. package/vendor/whisper.cpp/ggml/include/ggml-opt.h +237 -0
  49. package/vendor/whisper.cpp/ggml/include/ggml-rpc.h +33 -0
  50. package/vendor/whisper.cpp/ggml/include/ggml-sycl.h +49 -0
  51. package/vendor/whisper.cpp/ggml/include/ggml-vulkan.h +29 -0
  52. package/vendor/whisper.cpp/ggml/include/ggml.h +2221 -0
  53. package/vendor/whisper.cpp/ggml/include/gguf.h +202 -0
  54. package/vendor/whisper.cpp/ggml/src/CMakeLists.txt +404 -0
  55. package/vendor/whisper.cpp/ggml/src/ggml-alloc.c +1042 -0
  56. package/vendor/whisper.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  57. package/vendor/whisper.cpp/ggml/src/ggml-amx/common.h +94 -0
  58. package/vendor/whisper.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  59. package/vendor/whisper.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  60. package/vendor/whisper.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  61. package/vendor/whisper.cpp/ggml/src/ggml-backend-impl.h +255 -0
  62. package/vendor/whisper.cpp/ggml/src/ggml-backend-reg.cpp +591 -0
  63. package/vendor/whisper.cpp/ggml/src/ggml-backend.cpp +2016 -0
  64. package/vendor/whisper.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  65. package/vendor/whisper.cpp/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  66. package/vendor/whisper.cpp/ggml/src/ggml-cann/CMakeLists.txt +75 -0
  67. package/vendor/whisper.cpp/ggml/src/ggml-cann/Doxyfile +2579 -0
  68. package/vendor/whisper.cpp/ggml/src/ggml-cann/acl_tensor.cpp +181 -0
  69. package/vendor/whisper.cpp/ggml/src/ggml-cann/acl_tensor.h +258 -0
  70. package/vendor/whisper.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +3193 -0
  71. package/vendor/whisper.cpp/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
  72. package/vendor/whisper.cpp/ggml/src/ggml-cann/common.h +425 -0
  73. package/vendor/whisper.cpp/ggml/src/ggml-cann/ggml-cann.cpp +2630 -0
  74. package/vendor/whisper.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
  75. package/vendor/whisper.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  76. package/vendor/whisper.cpp/ggml/src/ggml-cann/kernels/dup.cpp +234 -0
  77. package/vendor/whisper.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  78. package/vendor/whisper.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  79. package/vendor/whisper.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  80. package/vendor/whisper.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  81. package/vendor/whisper.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  82. package/vendor/whisper.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  83. package/vendor/whisper.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  84. package/vendor/whisper.cpp/ggml/src/ggml-common.h +1861 -0
  85. package/vendor/whisper.cpp/ggml/src/ggml-cpu/CMakeLists.txt +584 -0
  86. package/vendor/whisper.cpp/ggml/src/ggml-cpu/amx/amx.cpp +221 -0
  87. package/vendor/whisper.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  88. package/vendor/whisper.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  89. package/vendor/whisper.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  90. package/vendor/whisper.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  91. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  92. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  93. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2162 -0
  94. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  95. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  96. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  97. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  98. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  99. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  100. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  101. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  102. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  103. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3284 -0
  104. package/vendor/whisper.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  105. package/vendor/whisper.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  106. package/vendor/whisper.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  107. package/vendor/whisper.cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  108. package/vendor/whisper.cpp/ggml/src/ggml-cpu/common.h +72 -0
  109. package/vendor/whisper.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +511 -0
  110. package/vendor/whisper.cpp/ggml/src/ggml-cpu/ggml-cpu.c +3473 -0
  111. package/vendor/whisper.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +671 -0
  112. package/vendor/whisper.cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  113. package/vendor/whisper.cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  114. package/vendor/whisper.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
  115. package/vendor/whisper.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
  116. package/vendor/whisper.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
  117. package/vendor/whisper.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  118. package/vendor/whisper.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3593 -0
  119. package/vendor/whisper.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +19 -0
  120. package/vendor/whisper.cpp/ggml/src/ggml-cpu/ops.cpp +9085 -0
  121. package/vendor/whisper.cpp/ggml/src/ggml-cpu/ops.h +111 -0
  122. package/vendor/whisper.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  123. package/vendor/whisper.cpp/ggml/src/ggml-cpu/quants.h +89 -0
  124. package/vendor/whisper.cpp/ggml/src/ggml-cpu/repack.cpp +1570 -0
  125. package/vendor/whisper.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  126. package/vendor/whisper.cpp/ggml/src/ggml-cpu/simd-mappings.h +1006 -0
  127. package/vendor/whisper.cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  128. package/vendor/whisper.cpp/ggml/src/ggml-cpu/traits.h +38 -0
  129. package/vendor/whisper.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  130. package/vendor/whisper.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  131. package/vendor/whisper.cpp/ggml/src/ggml-cpu/vec.cpp +321 -0
  132. package/vendor/whisper.cpp/ggml/src/ggml-cpu/vec.h +973 -0
  133. package/vendor/whisper.cpp/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
  134. package/vendor/whisper.cpp/ggml/src/ggml-cuda/acc.cu +61 -0
  135. package/vendor/whisper.cpp/ggml/src/ggml-cuda/acc.cuh +5 -0
  136. package/vendor/whisper.cpp/ggml/src/ggml-cuda/arange.cu +34 -0
  137. package/vendor/whisper.cpp/ggml/src/ggml-cuda/arange.cuh +5 -0
  138. package/vendor/whisper.cpp/ggml/src/ggml-cuda/argmax.cu +91 -0
  139. package/vendor/whisper.cpp/ggml/src/ggml-cuda/argmax.cuh +3 -0
  140. package/vendor/whisper.cpp/ggml/src/ggml-cuda/argsort.cu +104 -0
  141. package/vendor/whisper.cpp/ggml/src/ggml-cuda/argsort.cuh +3 -0
  142. package/vendor/whisper.cpp/ggml/src/ggml-cuda/binbcast.cu +363 -0
  143. package/vendor/whisper.cpp/ggml/src/ggml-cuda/binbcast.cuh +9 -0
  144. package/vendor/whisper.cpp/ggml/src/ggml-cuda/clamp.cu +45 -0
  145. package/vendor/whisper.cpp/ggml/src/ggml-cuda/clamp.cuh +5 -0
  146. package/vendor/whisper.cpp/ggml/src/ggml-cuda/common.cuh +812 -0
  147. package/vendor/whisper.cpp/ggml/src/ggml-cuda/concat.cu +221 -0
  148. package/vendor/whisper.cpp/ggml/src/ggml-cuda/concat.cuh +5 -0
  149. package/vendor/whisper.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
  150. package/vendor/whisper.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  151. package/vendor/whisper.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  152. package/vendor/whisper.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  153. package/vendor/whisper.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  154. package/vendor/whisper.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  155. package/vendor/whisper.cpp/ggml/src/ggml-cuda/convert.cu +730 -0
  156. package/vendor/whisper.cpp/ggml/src/ggml-cuda/convert.cuh +26 -0
  157. package/vendor/whisper.cpp/ggml/src/ggml-cuda/count-equal.cu +64 -0
  158. package/vendor/whisper.cpp/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  159. package/vendor/whisper.cpp/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  160. package/vendor/whisper.cpp/ggml/src/ggml-cuda/cpy.cu +705 -0
  161. package/vendor/whisper.cpp/ggml/src/ggml-cuda/cpy.cuh +11 -0
  162. package/vendor/whisper.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
  163. package/vendor/whisper.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  164. package/vendor/whisper.cpp/ggml/src/ggml-cuda/dequantize.cuh +103 -0
  165. package/vendor/whisper.cpp/ggml/src/ggml-cuda/diagmask.cu +40 -0
  166. package/vendor/whisper.cpp/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  167. package/vendor/whisper.cpp/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
  168. package/vendor/whisper.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1474 -0
  169. package/vendor/whisper.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
  170. package/vendor/whisper.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
  171. package/vendor/whisper.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
  172. package/vendor/whisper.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
  173. package/vendor/whisper.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
  174. package/vendor/whisper.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
  175. package/vendor/whisper.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +634 -0
  176. package/vendor/whisper.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
  177. package/vendor/whisper.cpp/ggml/src/ggml-cuda/fattn.cu +346 -0
  178. package/vendor/whisper.cpp/ggml/src/ggml-cuda/fattn.cuh +3 -0
  179. package/vendor/whisper.cpp/ggml/src/ggml-cuda/getrows.cu +275 -0
  180. package/vendor/whisper.cpp/ggml/src/ggml-cuda/getrows.cuh +15 -0
  181. package/vendor/whisper.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +3562 -0
  182. package/vendor/whisper.cpp/ggml/src/ggml-cuda/gla.cu +93 -0
  183. package/vendor/whisper.cpp/ggml/src/ggml-cuda/gla.cuh +3 -0
  184. package/vendor/whisper.cpp/ggml/src/ggml-cuda/im2col.cu +103 -0
  185. package/vendor/whisper.cpp/ggml/src/ggml-cuda/im2col.cuh +5 -0
  186. package/vendor/whisper.cpp/ggml/src/ggml-cuda/mma.cuh +396 -0
  187. package/vendor/whisper.cpp/ggml/src/ggml-cuda/mmq.cu +324 -0
  188. package/vendor/whisper.cpp/ggml/src/ggml-cuda/mmq.cuh +3217 -0
  189. package/vendor/whisper.cpp/ggml/src/ggml-cuda/mmv.cu +336 -0
  190. package/vendor/whisper.cpp/ggml/src/ggml-cuda/mmv.cuh +12 -0
  191. package/vendor/whisper.cpp/ggml/src/ggml-cuda/mmvq.cu +595 -0
  192. package/vendor/whisper.cpp/ggml/src/ggml-cuda/mmvq.cuh +12 -0
  193. package/vendor/whisper.cpp/ggml/src/ggml-cuda/norm.cu +458 -0
  194. package/vendor/whisper.cpp/ggml/src/ggml-cuda/norm.cuh +11 -0
  195. package/vendor/whisper.cpp/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  196. package/vendor/whisper.cpp/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  197. package/vendor/whisper.cpp/ggml/src/ggml-cuda/out-prod.cu +68 -0
  198. package/vendor/whisper.cpp/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  199. package/vendor/whisper.cpp/ggml/src/ggml-cuda/pad.cu +49 -0
  200. package/vendor/whisper.cpp/ggml/src/ggml-cuda/pad.cuh +5 -0
  201. package/vendor/whisper.cpp/ggml/src/ggml-cuda/pool2d.cu +94 -0
  202. package/vendor/whisper.cpp/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  203. package/vendor/whisper.cpp/ggml/src/ggml-cuda/quantize.cu +190 -0
  204. package/vendor/whisper.cpp/ggml/src/ggml-cuda/quantize.cuh +27 -0
  205. package/vendor/whisper.cpp/ggml/src/ggml-cuda/rope.cu +456 -0
  206. package/vendor/whisper.cpp/ggml/src/ggml-cuda/rope.cuh +7 -0
  207. package/vendor/whisper.cpp/ggml/src/ggml-cuda/scale.cu +31 -0
  208. package/vendor/whisper.cpp/ggml/src/ggml-cuda/scale.cuh +5 -0
  209. package/vendor/whisper.cpp/ggml/src/ggml-cuda/softmax.cu +283 -0
  210. package/vendor/whisper.cpp/ggml/src/ggml-cuda/softmax.cuh +7 -0
  211. package/vendor/whisper.cpp/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
  212. package/vendor/whisper.cpp/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  213. package/vendor/whisper.cpp/ggml/src/ggml-cuda/ssm-scan.cu +155 -0
  214. package/vendor/whisper.cpp/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  215. package/vendor/whisper.cpp/ggml/src/ggml-cuda/sum.cu +45 -0
  216. package/vendor/whisper.cpp/ggml/src/ggml-cuda/sum.cuh +5 -0
  217. package/vendor/whisper.cpp/ggml/src/ggml-cuda/sumrows.cu +39 -0
  218. package/vendor/whisper.cpp/ggml/src/ggml-cuda/sumrows.cuh +5 -0
  219. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
  220. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
  221. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  222. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  223. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
  224. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
  225. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
  226. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
  227. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  228. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  229. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
  230. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  231. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
  232. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
  233. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  234. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  235. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  236. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
  237. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
  238. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  239. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  240. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  241. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  242. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  243. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  244. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  245. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  246. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  247. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  248. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  249. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  250. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  251. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  252. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  253. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  254. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  255. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  256. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  257. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  258. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  259. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  260. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  261. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  262. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  263. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  264. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  265. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  266. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  267. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  268. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  269. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  270. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  271. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  272. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  273. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  274. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  275. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  276. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  277. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  278. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  279. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  280. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  281. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  282. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  283. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  284. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  285. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  286. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  287. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  288. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  289. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  290. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  291. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  292. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  293. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  294. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  295. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  296. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  297. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  298. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  299. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  300. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  301. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  302. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  303. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  304. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  305. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  306. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  307. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  308. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  309. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  310. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  311. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  312. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  313. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  314. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  315. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  316. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  317. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  318. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  319. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  320. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  321. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  322. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  323. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  324. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
  325. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  326. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  327. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  328. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  329. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  330. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  331. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  332. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  333. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  334. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  335. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  336. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  337. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  338. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  339. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  340. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  341. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  342. package/vendor/whisper.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  343. package/vendor/whisper.cpp/ggml/src/ggml-cuda/tsembd.cu +47 -0
  344. package/vendor/whisper.cpp/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  345. package/vendor/whisper.cpp/ggml/src/ggml-cuda/unary.cu +289 -0
  346. package/vendor/whisper.cpp/ggml/src/ggml-cuda/unary.cuh +59 -0
  347. package/vendor/whisper.cpp/ggml/src/ggml-cuda/upscale.cu +51 -0
  348. package/vendor/whisper.cpp/ggml/src/ggml-cuda/upscale.cuh +5 -0
  349. package/vendor/whisper.cpp/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
  350. package/vendor/whisper.cpp/ggml/src/ggml-cuda/vendors/cuda.h +15 -0
  351. package/vendor/whisper.cpp/ggml/src/ggml-cuda/vendors/hip.h +243 -0
  352. package/vendor/whisper.cpp/ggml/src/ggml-cuda/vendors/musa.h +140 -0
  353. package/vendor/whisper.cpp/ggml/src/ggml-cuda/wkv.cu +199 -0
  354. package/vendor/whisper.cpp/ggml/src/ggml-cuda/wkv.cuh +7 -0
  355. package/vendor/whisper.cpp/ggml/src/ggml-hip/CMakeLists.txt +135 -0
  356. package/vendor/whisper.cpp/ggml/src/ggml-impl.h +603 -0
  357. package/vendor/whisper.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  358. package/vendor/whisper.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  359. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
  360. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
  361. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
  362. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
  363. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
  364. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
  365. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
  366. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
  367. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
  368. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
  369. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
  370. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
  371. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
  372. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
  373. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
  374. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
  375. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
  376. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
  377. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
  378. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
  379. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
  380. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
  381. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
  382. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
  383. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
  384. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
  385. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
  386. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
  387. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
  388. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
  389. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
  390. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
  391. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
  392. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
  393. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
  394. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
  395. package/vendor/whisper.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
  396. package/vendor/whisper.cpp/ggml/src/ggml-metal/CMakeLists.txt +121 -0
  397. package/vendor/whisper.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +622 -0
  398. package/vendor/whisper.cpp/ggml/src/ggml-metal/ggml-metal.m +6023 -0
  399. package/vendor/whisper.cpp/ggml/src/ggml-metal/ggml-metal.metal +7124 -0
  400. package/vendor/whisper.cpp/ggml/src/ggml-musa/CMakeLists.txt +113 -0
  401. package/vendor/whisper.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  402. package/vendor/whisper.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  403. package/vendor/whisper.cpp/ggml/src/ggml-opencl/CMakeLists.txt +109 -0
  404. package/vendor/whisper.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6665 -0
  405. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/add.cl +83 -0
  406. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  407. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  408. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  409. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  410. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  411. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  412. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  413. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  414. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  415. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  416. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  417. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  418. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  419. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  420. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  421. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  422. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  423. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  424. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  425. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  426. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  427. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  428. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  429. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  430. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  431. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  432. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  433. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  434. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  435. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  436. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  437. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  438. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  439. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  440. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  441. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  442. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  443. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  444. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  445. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  446. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  447. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  448. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  449. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  450. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  451. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
  452. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  453. package/vendor/whisper.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  454. package/vendor/whisper.cpp/ggml/src/ggml-opt.cpp +1037 -0
  455. package/vendor/whisper.cpp/ggml/src/ggml-quants.c +5230 -0
  456. package/vendor/whisper.cpp/ggml/src/ggml-quants.h +100 -0
  457. package/vendor/whisper.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  458. package/vendor/whisper.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +1816 -0
  459. package/vendor/whisper.cpp/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
  460. package/vendor/whisper.cpp/ggml/src/ggml-sycl/backend.hpp +37 -0
  461. package/vendor/whisper.cpp/ggml/src/ggml-sycl/binbcast.cpp +344 -0
  462. package/vendor/whisper.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  463. package/vendor/whisper.cpp/ggml/src/ggml-sycl/common.cpp +83 -0
  464. package/vendor/whisper.cpp/ggml/src/ggml-sycl/common.hpp +584 -0
  465. package/vendor/whisper.cpp/ggml/src/ggml-sycl/concat.cpp +182 -0
  466. package/vendor/whisper.cpp/ggml/src/ggml-sycl/concat.hpp +20 -0
  467. package/vendor/whisper.cpp/ggml/src/ggml-sycl/conv.cpp +95 -0
  468. package/vendor/whisper.cpp/ggml/src/ggml-sycl/conv.hpp +20 -0
  469. package/vendor/whisper.cpp/ggml/src/ggml-sycl/convert.cpp +575 -0
  470. package/vendor/whisper.cpp/ggml/src/ggml-sycl/convert.hpp +34 -0
  471. package/vendor/whisper.cpp/ggml/src/ggml-sycl/cpy.cpp +839 -0
  472. package/vendor/whisper.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  473. package/vendor/whisper.cpp/ggml/src/ggml-sycl/dequantize.hpp +823 -0
  474. package/vendor/whisper.cpp/ggml/src/ggml-sycl/dmmv.cpp +1144 -0
  475. package/vendor/whisper.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  476. package/vendor/whisper.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2987 -0
  477. package/vendor/whisper.cpp/ggml/src/ggml-sycl/element_wise.cpp +1511 -0
  478. package/vendor/whisper.cpp/ggml/src/ggml-sycl/element_wise.hpp +77 -0
  479. package/vendor/whisper.cpp/ggml/src/ggml-sycl/gemm.hpp +102 -0
  480. package/vendor/whisper.cpp/ggml/src/ggml-sycl/getrows.cpp +212 -0
  481. package/vendor/whisper.cpp/ggml/src/ggml-sycl/getrows.hpp +20 -0
  482. package/vendor/whisper.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +4608 -0
  483. package/vendor/whisper.cpp/ggml/src/ggml-sycl/gla.cpp +106 -0
  484. package/vendor/whisper.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  485. package/vendor/whisper.cpp/ggml/src/ggml-sycl/im2col.cpp +136 -0
  486. package/vendor/whisper.cpp/ggml/src/ggml-sycl/im2col.hpp +21 -0
  487. package/vendor/whisper.cpp/ggml/src/ggml-sycl/mmq.cpp +3010 -0
  488. package/vendor/whisper.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  489. package/vendor/whisper.cpp/ggml/src/ggml-sycl/mmvq.cpp +1065 -0
  490. package/vendor/whisper.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  491. package/vendor/whisper.cpp/ggml/src/ggml-sycl/norm.cpp +482 -0
  492. package/vendor/whisper.cpp/ggml/src/ggml-sycl/norm.hpp +26 -0
  493. package/vendor/whisper.cpp/ggml/src/ggml-sycl/outprod.cpp +47 -0
  494. package/vendor/whisper.cpp/ggml/src/ggml-sycl/outprod.hpp +10 -0
  495. package/vendor/whisper.cpp/ggml/src/ggml-sycl/presets.hpp +74 -0
  496. package/vendor/whisper.cpp/ggml/src/ggml-sycl/quants.hpp +111 -0
  497. package/vendor/whisper.cpp/ggml/src/ggml-sycl/rope.cpp +472 -0
  498. package/vendor/whisper.cpp/ggml/src/ggml-sycl/rope.hpp +20 -0
  499. package/vendor/whisper.cpp/ggml/src/ggml-sycl/softmax.cpp +261 -0
  500. package/vendor/whisper.cpp/ggml/src/ggml-sycl/softmax.hpp +20 -0
  501. package/vendor/whisper.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  502. package/vendor/whisper.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  503. package/vendor/whisper.cpp/ggml/src/ggml-sycl/tsembd.cpp +67 -0
  504. package/vendor/whisper.cpp/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  505. package/vendor/whisper.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1307 -0
  506. package/vendor/whisper.cpp/ggml/src/ggml-sycl/wkv.cpp +289 -0
  507. package/vendor/whisper.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  508. package/vendor/whisper.cpp/ggml/src/ggml-threading.cpp +12 -0
  509. package/vendor/whisper.cpp/ggml/src/ggml-threading.h +14 -0
  510. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +189 -0
  511. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  512. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +10937 -0
  513. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +27 -0
  514. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
  515. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
  516. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
  517. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
  518. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  519. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  520. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  521. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  522. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  523. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  524. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  525. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
  526. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  527. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  528. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  529. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
  530. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
  531. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
  532. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  533. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  534. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  535. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  536. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
  537. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
  538. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
  539. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  540. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  541. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  542. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  543. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  544. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  545. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  546. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  547. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  548. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  549. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  550. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  551. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  552. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  553. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
  554. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  555. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  556. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
  557. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
  558. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  559. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  560. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
  561. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
  562. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
  563. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
  564. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
  565. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  566. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
  567. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
  568. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  569. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  570. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  571. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  572. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
  573. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
  574. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
  575. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  576. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
  577. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  578. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  579. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  580. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
  581. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
  582. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
  583. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  584. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
  585. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
  586. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  587. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
  588. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
  589. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
  590. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
  591. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  592. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  593. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
  594. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  595. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
  596. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  597. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  598. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  599. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +52 -0
  600. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  601. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
  602. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
  603. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
  604. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
  605. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
  606. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  607. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  608. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  609. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  610. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  611. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
  612. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
  613. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  614. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  615. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
  616. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  617. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
  618. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
  619. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
  620. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
  621. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
  622. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
  623. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
  624. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +753 -0
  625. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  626. package/vendor/whisper.cpp/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  627. package/vendor/whisper.cpp/ggml/src/ggml.c +6601 -0
  628. package/vendor/whisper.cpp/ggml/src/ggml.cpp +26 -0
  629. package/vendor/whisper.cpp/ggml/src/gguf.cpp +1347 -0
  630. package/vendor/whisper.cpp/include/whisper.h +738 -0
  631. package/vendor/whisper.cpp/src/CMakeLists.txt +145 -0
  632. package/vendor/whisper.cpp/src/coreml/whisper-compat.h +10 -0
  633. package/vendor/whisper.cpp/src/coreml/whisper-compat.m +35 -0
  634. package/vendor/whisper.cpp/src/coreml/whisper-decoder-impl.h +158 -0
  635. package/vendor/whisper.cpp/src/coreml/whisper-decoder-impl.m +227 -0
  636. package/vendor/whisper.cpp/src/coreml/whisper-encoder-impl.h +154 -0
  637. package/vendor/whisper.cpp/src/coreml/whisper-encoder-impl.m +223 -0
  638. package/vendor/whisper.cpp/src/coreml/whisper-encoder.h +26 -0
  639. package/vendor/whisper.cpp/src/coreml/whisper-encoder.mm +73 -0
  640. package/vendor/whisper.cpp/src/openvino/whisper-openvino-encoder.cpp +108 -0
  641. package/vendor/whisper.cpp/src/openvino/whisper-openvino-encoder.h +31 -0
  642. package/vendor/whisper.cpp/src/whisper-arch.h +197 -0
  643. package/vendor/whisper.cpp/src/whisper.cpp +8969 -0
@@ -0,0 +1,1299 @@
1
+ #define GGML_COMMON_IMPL_C
2
+ #include "ggml-common.h"
3
+ #include "ggml-quants.h"
4
+ #include "ggml-impl.h"
5
+ #include "ggml-cpu.h"
6
+
7
+ #include "../../quants.h"
8
+ #include "../../ggml-cpu-impl.h"
9
+
10
+ #include <math.h>
11
+ #include <string.h>
12
+ #include <assert.h>
13
+ #include <float.h>
14
+ #include <stdlib.h> // for qsort
15
+ #include <stdio.h> // for GGML_ASSERT
16
+
17
+ #define GROUP_MAX_EPS 1e-15f
18
+ #define GROUP_MAX_EPS_IQ3_XXS 1e-8f
19
+ #define GROUP_MAX_EPS_IQ2_S 1e-8f
20
+ #define GROUP_MAX_EPS_IQ1_M 1e-7f
21
+ #define GROUP_MAX_EPS_IQ1_S 1e-12f
22
+
23
+ #define UNUSED GGML_UNUSED
24
+ // Quantizes k floats from x into block_q8_0 blocks written through vy (k must be a multiple of QK8_0). Builds with __VXE__/__VXE2__ use the vector loop below; all other builds call quantize_row_q8_0_ref.
25
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
26
+ assert(QK8_0 == 32); // the vector path below hard-codes 32 floats per block (8 vectors x 4 lanes)
27
+ assert(k % QK8_0 == 0);
28
+ const int nb = k / QK8_0; // number of blocks to produce
29
+
30
+ block_q8_0 * GGML_RESTRICT y = vy;
31
+
32
+ #if defined(__VXE__) || defined(__VXE2__)
33
+ for (int i = 0; i < nb; i++) {
34
+ __vector float srcv [8];
35
+ __vector float asrcv[8];
36
+ __vector float amaxv[8];
37
+ // Load the block's 32 floats as 8 vectors, take absolute values, then max-reduce pairwise until amaxv[0] holds the lane-wise maxima.
38
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
39
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
40
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
41
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
42
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
43
+ // Reduce the four lanes of amaxv[0] to the block's absolute maximum.
44
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
45
+ vec_extract(amaxv[0], 1)),
46
+ MAX(vec_extract(amaxv[0], 2),
47
+ vec_extract(amaxv[0], 3)));
48
+ // d is the block scale (amax / 127); id is its reciprocal, forced to 0 when d is 0 so the division is skipped.
49
+ const float d = amax / ((1 << 7) - 1);
50
+ const float id = d ? 1.0f / d : 0.0f;
51
+
52
+ y[i].d = GGML_FP32_TO_FP16(d);
53
+ // Scale each source vector by id, convert to int32 with vec_signed, and store the four lanes as the block's quants.
54
+ for (int j = 0; j < 8; j++) {
55
+ const __vector float v = vec_mul(srcv[j], vec_splats(id));
56
+ const __vector int32_t vi = vec_signed(v);
57
+
58
+ y[i].qs[4*j + 0] = vec_extract(vi, 0);
59
+ y[i].qs[4*j + 1] = vec_extract(vi, 1);
60
+ y[i].qs[4*j + 2] = vec_extract(vi, 2);
61
+ y[i].qs[4*j + 3] = vec_extract(vi, 3);
62
+ }
63
+ }
64
+ #else
65
+ GGML_UNUSED(nb); // nb is only read by the vector path
66
+ // scalar fallback: delegate to the reference implementation
67
+ quantize_row_q8_0_ref(x, y, k);
68
+ #endif
69
+ }
70
+ // Quantizes k floats from x into block_q8_1 blocks written through vy (k must be a multiple of QK8_1). The vector path stores, per block, the scale d, the quants, and s = d * (sum of the quants); other builds call quantize_row_q8_1_ref.
71
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
72
+ assert(k % QK8_1 == 0);
73
+ const int nb = k / QK8_1; // number of blocks to produce
74
+
75
+ block_q8_1 * GGML_RESTRICT y = vy;
76
+
77
+ #if defined(__VXE__) || defined(__VXE2__)
78
+ for (int i = 0; i < nb; i++) {
79
+ __vector float srcv [8];
80
+ __vector float asrcv[8];
81
+ __vector float amaxv[8];
82
+ // Load the block as 8 vectors, take absolute values, then max-reduce pairwise until amaxv[0] holds the lane-wise maxima.
83
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); // NOTE(review): hard-codes 32 floats per block, but QK8_1 == 32 is not asserted in this function - confirm
84
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
85
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
86
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
87
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
88
+ // Reduce the four lanes of amaxv[0] to the block's absolute maximum.
89
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
90
+ vec_extract(amaxv[0], 1)),
91
+ MAX(vec_extract(amaxv[0], 2),
92
+ vec_extract(amaxv[0], 3)));
93
+ // d is the block scale (amax / 127); id is its reciprocal, forced to 0 when d is 0 so the division is skipped.
94
+ const float d = amax / ((1 << 7) - 1);
95
+ const float id = d ? 1.0f / d : 0.0f;
96
+
97
+ y[i].d = GGML_FP32_TO_FP16(d);
98
+
99
+ __vector int32_t acc = vec_splats(0); // lane-wise running sum of this block's quants
100
+ // Scale each source vector by id, convert to int32 with vec_signed, store the four lanes as quants, and add them into acc.
101
+ for (int j = 0; j < 8; j++) {
102
+ const __vector float v = vec_mul(srcv[j], vec_splats(id));
103
+ const __vector int32_t vi = vec_signed(v);
104
+
105
+ y[i].qs[4*j + 0] = vec_extract(vi, 0);
106
+ y[i].qs[4*j + 1] = vec_extract(vi, 1);
107
+ y[i].qs[4*j + 2] = vec_extract(vi, 2);
108
+ y[i].qs[4*j + 3] = vec_extract(vi, 3);
109
+
110
+ acc = vec_add(acc, vi);
111
+ }
112
+ // s = d * (sum of all quants in the block), stored as fp16.
113
+ y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
114
+ }
115
+ #else
116
+ GGML_UNUSED(nb); // nb is only read by the vector path
117
+ // scalar fallback: delegate to the reference implementation
118
+ quantize_row_q8_1_ref(x, y, k);
119
+ #endif
120
+ }
121
+
122
+
123
+ //===================================== Dot products =================================
124
+ // Dot product of n elements between a block_q4_0 row (vx) and a block_q8_0 row (vy); the scalar result is written to *s. n must be a multiple of QK8_0 and nrc must be 1; bs, bx and by are unused.
125
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
126
+ const int qk = QK8_0;
127
+ const int nb = n / qk; // number of blocks in each row
128
+
129
+ assert(n % qk == 0);
130
+ assert(nrc == 1);
131
+ UNUSED(nrc);
132
+ UNUSED(bx);
133
+ UNUSED(by);
134
+ UNUSED(bs);
135
+
136
+ const block_q4_0 * GGML_RESTRICT x = vx;
137
+ const block_q8_0 * GGML_RESTRICT y = vy;
138
+
139
+ int ib = 0; // block index, shared by the vector loop and the scalar loop below
140
+ float sumf = 0;
141
+
142
+ #if defined(__VXE__) || defined(__VXE2__)
143
+ __vector float acc = vec_splats(0.0f);
144
+
145
+ const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F); // low-nibble mask
146
+ const __vector int8_t v_s = vec_splats( (const int8_t)0x08); // offset subtracted from every nibble
147
+
148
+ for (; ib < nb; ++ib) {
149
+ const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
150
+ const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m); // low nibbles
151
+ const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4); // high nibbles
152
+
153
+ const __vector int8_t v_xls = vec_sub(v_xl, v_s);
154
+ const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
155
+ // Low nibbles pair with the first QK8_0/2 quants of y, high nibbles with the second half (same pairing as the scalar loop).
156
+ const __vector int8_t v_yl = vec_xl(0 , y[ib].qs);
157
+ const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
158
+ // Widening int8 x int8 -> int16 products; presumably vec_mulo / vec_mule cover the odd / even elements so together they cover all 16 - confirm against the intrinsic reference.
159
+ const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
160
+ const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
161
+ const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
162
+ const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
163
+ // Sum the partial products, then add the element-reversed vector so that (presumably) the half taken by vec_unpackh below already accounts for all eight int16 sums - confirm.
164
+ __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
165
+
166
+ const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
167
+ const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); // combined scale of both blocks
168
+
169
+ acc = vec_madd(v_xy, v_d, acc);
170
+ }
171
+ // Horizontal sum of the four accumulator lanes.
172
+ sumf = acc[0] + acc[1] + acc[2] + acc[3];
173
+
174
+ #endif
175
+ for (; ib < nb; ++ib) { // scalar path: when the vector loop above is compiled in, ib already equals nb and this loop does not run
176
+ int sumi0 = 0;
177
+ int sumi1 = 0;
178
+
179
+ for (int j = 0; j < qk/2; ++j) {
180
+ const int v0 = (x[ib].qs[j] & 0x0F) - 8; // low nibble, offset by 8
181
+ const int v1 = (x[ib].qs[j] >> 4) - 8; // high nibble, offset by 8
182
+
183
+ sumi0 += (v0 * y[ib].qs[j]);
184
+ sumi1 += (v1 * y[ib].qs[j + qk/2]);
185
+ }
186
+
187
+ int sumi = sumi0 + sumi1;
188
+ sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
189
+ }
190
+
191
+ *s = sumf;
192
+ }
193
+ // Dot product of n elements between a block_q4_1 row (vx) and a block_q8_1 row (vy); the scalar result is written to *s. n must be a multiple of QK8_1 and nrc must be 1; bs, bx and by are unused.
194
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
195
+ const int qk = QK8_1;
196
+ const int nb = n / qk; // number of blocks in each row
197
+
198
+ assert(n % qk == 0);
199
+ assert(nrc == 1);
200
+ UNUSED(nrc);
201
+ UNUSED(bx);
202
+ UNUSED(by);
203
+ UNUSED(bs);
204
+
205
+ const block_q4_1 * GGML_RESTRICT x = vx;
206
+ const block_q8_1 * GGML_RESTRICT y = vy;
207
+
208
+ int ib = 0; // block index, shared by the vector loop and the scalar loop below
209
+ float sumf = 0;
210
+
211
+ #if defined(__VXE__) || defined(__VXE2__)
212
+ float summs = 0; // running sum of the per-block x.m * y.s terms
213
+ float32x4_t acc = vec_splats(0.0f);
214
+
215
+ const uint8x16_t v_m = vec_splat_u8(0x0F); // low-nibble mask
216
+
217
+ #pragma GCC unroll 4
218
+ for (; ib < nb; ++ib) {
219
+ __builtin_prefetch(x[ib].qs, 0, 1); // prefetch hint: read access (0), low temporal locality (1)
220
+ __builtin_prefetch(y[ib].qs, 0, 1);
221
+
222
+ summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
223
+ // Split the packed bytes into low and high nibbles; unlike the q4_0 kernel, no offset is subtracted here.
224
+ const uint8x16_t v_x = vec_xl(0, x[ib].qs);
225
+ const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
226
+ const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
227
+ // Low nibbles pair with the first QK8_1/2 quants of y, high nibbles with the second half (same pairing as the scalar loop).
228
+ const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
229
+ const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs);
230
+ // ggml_vec_dot is defined outside this view; presumably it adds int8 x int8 products into the int32 lanes of its first argument - confirm.
231
+ const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
232
+ const float32x4_t v_xy = vec_float(v_xy_);
233
+
234
+ const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); // combined scale of both blocks
235
+
236
+ acc = vec_madd(v_xy, v_d, acc);
237
+ }
238
+ // Horizontal sum of the four accumulator lanes plus the accumulated x.m * y.s terms.
239
+ sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
240
+
241
+ #endif
242
+ for (; ib < nb; ++ib) { // scalar path: when the vector loop above is compiled in, ib already equals nb and this loop does not run
243
+ int sumi0 = 0;
244
+ int sumi1 = 0;
245
+
246
+ for (int j = 0; j < qk/2; ++j) {
247
+ const int v0 = (x[ib].qs[j] & 0x0F); // low nibble
248
+ const int v1 = (x[ib].qs[j] >> 4); // high nibble
249
+
250
+ sumi0 += (v0 * y[ib].qs[j]);
251
+ sumi1 += (v1 * y[ib].qs[j + qk/2]);
252
+ }
253
+
254
+ int sumi = sumi0 + sumi1;
255
+ sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
256
+ }
257
+
258
+ *s = sumf;
259
+ }
260
+
261
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
262
+ const int qk = QK8_0;
263
+ const int nb = n / qk;
264
+
265
+ assert(n % qk == 0);
266
+ assert(nrc == 1);
267
+ UNUSED(nrc);
268
+ UNUSED(bx);
269
+ UNUSED(by);
270
+ UNUSED(bs);
271
+
272
+ const block_q8_0 * GGML_RESTRICT x = vx;
273
+ const block_q8_0 * GGML_RESTRICT y = vy;
274
+
275
+ int ib = 0;
276
+ float sumf = 0;
277
+
278
+ #if defined(__VXE__) || defined(__VXE2__)
279
+ __vector float acc = vec_splats(0.0f);
280
+
281
+ #pragma GCC unroll 8
282
+ for (; ib < nb; ++ib) {
283
+ __builtin_prefetch(x[ib].qs, 0, 1);
284
+ __builtin_prefetch(y[ib].qs, 0, 1);
285
+
286
+ const int8x16_t v_xl = vec_xl(0 , x[ib].qs);
287
+ const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs);
288
+ const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
289
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
290
+
291
+ const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
292
+ const float32x4_t v_xy = vec_float(v_xy_);
293
+ const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
294
+
295
+ acc = vec_madd(v_xy, v_d, acc);
296
+ }
297
+
298
+ sumf = acc[0] + acc[1] + acc[2] + acc[3];
299
+
300
+ #endif
301
+ for (; ib < nb; ++ib) {
302
+ int sumi = 0;
303
+
304
+ for (int j = 0; j < qk; j++) {
305
+ sumi += x[ib].qs[j]*y[ib].qs[j];
306
+ }
307
+
308
+ sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
309
+ }
310
+
311
+ *s = sumf;
312
+ }
313
+
314
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
315
+ assert(n % QK_K == 0);
316
+ assert(nrc == 1);
317
+ UNUSED(nrc);
318
+ UNUSED(bx);
319
+ UNUSED(by);
320
+ UNUSED(bs);
321
+
322
+ const uint32_t kmask1 = 0x03030303;
323
+ const uint32_t kmask2 = 0x0f0f0f0f;
324
+
325
+ const block_q3_K * GGML_RESTRICT x = vx;
326
+ const block_q8_K * GGML_RESTRICT y = vy;
327
+
328
+ const int nb = n / QK_K;
329
+
330
+ #if defined(__VXE__) || defined(__VXE2__)
331
+ uint32_t aux[3];
332
+ uint32_t utmp[4];
333
+
334
+ const int32x4_t v_z = vec_splat_s32(0);
335
+ const uint8x16_t v_3m = vec_splat_u8(0x03);
336
+
337
+ const uint8x16_t v_0c = vec_splat_u8(1);
338
+ const uint8x16_t v_1c = vec_sl(v_0c, 1);
339
+ const uint8x16_t v_2c = vec_sl(v_0c, 2);
340
+ const uint8x16_t v_3c = vec_sl(v_0c, 3);
341
+
342
+ uint8x16_t q3h[4];
343
+ uint8x16_t q3b[2];
344
+ int8x16_t q3bytes[4];
345
+ int8x16_t q8bytes[4];
346
+ uint8x16_t qhbits[2];
347
+
348
+ float sum = 0;
349
+
350
+ for (int i = 0; i < nb; ++i) {
351
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
352
+
353
+ const uint8_t * restrict x0l = x[i].qs;
354
+ const uint8_t * restrict x0h = x[i].hmask;
355
+ const int8_t * restrict y0 = y[i].qs;
356
+
357
+ qhbits[0] = vec_xl(0 , x0h);
358
+ qhbits[1] = vec_xl(16, x0h);
359
+
360
+ int32_t isum = 0;
361
+
362
+ memcpy(aux, x[i].scales, 12);
363
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
364
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
365
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
366
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
367
+
368
+ int8_t * scale = (int8_t *)utmp;
369
+ for (int j = 0; j < 16; ++j) scale[j] -= 32;
370
+
371
+ for (int j = 0; j < QK_K/128; ++j) {
372
+ int32x4_t isum0, isum1, isum2, isum3;
373
+
374
+ q3b[0] = vec_xl(0 , x0l);
375
+ q3b[1] = vec_xl(16, x0l);
376
+ x0l += 32;
377
+
378
+ q8bytes[0] = vec_xl(0 , y0);
379
+ q8bytes[1] = vec_xl(16 , y0);
380
+ q8bytes[2] = vec_xl(32 , y0);
381
+ q8bytes[3] = vec_xl(48 , y0);
382
+ q8bytes[4] = vec_xl(64 , y0);
383
+ q8bytes[5] = vec_xl(80 , y0);
384
+ q8bytes[6] = vec_xl(96 , y0);
385
+ q8bytes[7] = vec_xl(112, y0);
386
+ y0 += 128;
387
+
388
+ q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
389
+ q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
390
+ q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
391
+ q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);
392
+
393
+ q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
394
+ q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
395
+ q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
396
+ q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);
397
+
398
+ isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
399
+ isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
400
+ isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
401
+ isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);
402
+
403
+ isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
404
+ isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
405
+ isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
406
+ isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
407
+
408
+ scale += 4;
409
+
410
+ q3h[0] = vec_andc(v_2c, qhbits[0]);
411
+ q3h[1] = vec_andc(v_2c, qhbits[1]);
412
+ q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
413
+ q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1);
414
+
415
+ q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]);
416
+ q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]);
417
+ q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]);
418
+ q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]);
419
+
420
+ isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]);
421
+ isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]);
422
+ isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
423
+ isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
424
+
425
+ isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
426
+ isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
427
+ isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
428
+ isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
429
+
430
+ scale += 4;
431
+
432
+ if (j == 0) {
433
+ qhbits[0] = vec_sr(qhbits[0], 4);
434
+ qhbits[1] = vec_sr(qhbits[1], 4);
435
+ }
436
+ }
437
+
438
+ sum += d * isum;
439
+ }
440
+
441
+ *s = sum;
442
+
443
+ #else
444
+ // scalar version
445
+ // This function is written like this so the compiler can manage to vectorize most of it
446
+ // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
447
+ // manually vectorized version above. Every other version I tried would run at least 4 times slower.
448
+ // The ideal situation would be if we could just write the code once, and the compiler would
449
+ // automatically produce the best possible set of machine instructions, instead of us having to manually
450
+ // write vectorized versions for AVX, ARM_NEON, etc.
451
+
452
+ int8_t aux8[QK_K];
453
+ int16_t aux16[8];
454
+ float sums [8];
455
+ int32_t aux32[8];
456
+ memset(sums, 0, 8*sizeof(float));
457
+
458
+ uint32_t auxs[4];
459
+ const int8_t * scales = (const int8_t*)auxs;
460
+
461
+ float sumf = 0;
462
+ for (int i = 0; i < nb; ++i) {
463
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
464
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
465
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
466
+ memset(aux32, 0, 8*sizeof(int32_t));
467
+ int8_t * GGML_RESTRICT a = aux8;
468
+ uint8_t m = 1;
469
+ for (int j = 0; j < QK_K; j += 128) {
470
+ for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
471
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
472
+ a += 32; m <<= 1;
473
+ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
474
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
475
+ a += 32; m <<= 1;
476
+ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
477
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
478
+ a += 32; m <<= 1;
479
+ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
480
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
481
+ a += 32; m <<= 1;
482
+ q3 += 32;
483
+ }
484
+ a = aux8;
485
+
486
+ memcpy(auxs, x[i].scales, 12);
487
+ uint32_t tmp = auxs[2];
488
+ auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
489
+ auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
490
+ auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
491
+ auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
492
+ for (int j = 0; j < QK_K/16; ++j) {
493
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
494
+ for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
495
+ q8 += 8; a += 8;
496
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
497
+ for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
498
+ q8 += 8; a += 8;
499
+ }
500
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
501
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
502
+ }
503
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
504
+ *s = sumf;
505
+
506
+ #endif
507
+
508
+ }
509
+
510
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
511
+ assert(n % QK_K == 0);
512
+ assert(nrc == 1);
513
+ UNUSED(nrc);
514
+ UNUSED(bx);
515
+ UNUSED(by);
516
+ UNUSED(bs);
517
+
518
+ const block_q4_K * GGML_RESTRICT x = vx;
519
+ const block_q8_K * GGML_RESTRICT y = vy;
520
+
521
+ const int nb = n / QK_K;
522
+
523
+ static const uint32_t kmask1 = 0x3f3f3f3f;
524
+ static const uint32_t kmask2 = 0x0f0f0f0f;
525
+ static const uint32_t kmask3 = 0x03030303;
526
+
527
+ uint32_t utmp[4];
528
+
529
+ #if defined(__VXE__) || defined(__VXE2__)
530
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
531
+ const int32x4_t v_z = vec_splat_s32(0);
532
+
533
+ uint8x16_t v_x[2];
534
+ int8x16_t v_xl[2];
535
+ int8x16_t v_y[2];
536
+
537
+ float sumf = 0;
538
+
539
+ for (int i = 0; i < nb; ++i) {
540
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
541
+ const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
542
+
543
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
544
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
545
+ const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
546
+
547
+ memcpy(utmp, x[i].scales, 12);
548
+
549
+ uint32x4_t v_mins8 = { 0 };
550
+ v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0);
551
+ v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1);
552
+
553
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
554
+ utmp[0] &= kmask1;
555
+
556
+ const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8);
557
+
558
+ const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh);
559
+ const int32x4_t v_minse = vec_mule(v_ysums, v_minsh);
560
+ const int32x4_t v_mins = v_minso + v_minse;
561
+ sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
562
+
563
+ const uint8_t * scales = (const uint8_t *)utmp;
564
+ const uint8_t * GGML_RESTRICT x0 = x[i].qs;
565
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
566
+
567
+ int32_t sumi1 = 0;
568
+ int32_t sumi2 = 0;
569
+
570
+ for (int j = 0; j < QK_K/64; ++j) {
571
+ v_x[0] = vec_xl(0 , x0);
572
+ v_x[1] = vec_xl(16, x0);
573
+ x0 += 32;
574
+
575
+ v_y[0] = vec_xl(0 , y0);
576
+ v_y[1] = vec_xl(16, y0);
577
+ y0 += 32;
578
+
579
+ v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm);
580
+ v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
581
+
582
+ const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
583
+ sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
584
+
585
+ v_y[0] = vec_xl(0 , y0);
586
+ v_y[1] = vec_xl(16, y0);
587
+ y0 += 32;
588
+
589
+ v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4);
590
+ v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
591
+
592
+ const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
593
+ sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
594
+ }
595
+
596
+ sumf += d * (sumi1 + sumi2);
597
+ }
598
+
599
+ *s = sumf;
600
+
601
+ #else
602
+
603
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
604
+ const uint8_t * mins = (const uint8_t*)&utmp[2];
605
+
606
+ int8_t aux8[QK_K];
607
+ int16_t aux16[8];
608
+ float sums [8];
609
+ int32_t aux32[8];
610
+ memset(sums, 0, 8*sizeof(float));
611
+
612
+ float sumf = 0;
613
+ for (int i = 0; i < nb; ++i) {
614
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
615
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
616
+ memset(aux32, 0, 8*sizeof(int32_t));
617
+ int8_t * GGML_RESTRICT a = aux8;
618
+ for (int j = 0; j < QK_K/64; ++j) {
619
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
620
+ a += 32;
621
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
622
+ a += 32; q4 += 32;
623
+ }
624
+ memcpy(utmp, x[i].scales, 12);
625
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
626
+ const uint32_t uaux = utmp[1] & kmask1;
627
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
628
+ utmp[2] = uaux;
629
+ utmp[0] &= kmask1;
630
+
631
+ int sumi = 0;
632
+ for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
633
+ a = aux8;
634
+ int is = 0;
635
+ for (int j = 0; j < QK_K/32; ++j) {
636
+ int32_t scale = scales[is++];
637
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
638
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
639
+ q8 += 8; a += 8;
640
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
641
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
642
+ q8 += 8; a += 8;
643
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
644
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
645
+ q8 += 8; a += 8;
646
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
647
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
648
+ q8 += 8; a += 8;
649
+ }
650
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
651
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
652
+ const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
653
+ sumf -= dmin * sumi;
654
+ }
655
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
656
+ *s = sumf;
657
+ #endif
658
+ }
659
+
660
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
661
+ assert(n % QK_K == 0);
662
+ assert(nrc == 1);
663
+ UNUSED(nrc);
664
+ UNUSED(bx);
665
+ UNUSED(by);
666
+ UNUSED(bs);
667
+
668
+ const block_q5_K * GGML_RESTRICT x = vx;
669
+ const block_q8_K * GGML_RESTRICT y = vy;
670
+
671
+ const int nb = n / QK_K;
672
+
673
+ static const uint32_t kmask1 = 0x3f3f3f3f;
674
+ static const uint32_t kmask2 = 0x0f0f0f0f;
675
+ static const uint32_t kmask3 = 0x03030303;
676
+
677
+ uint32_t utmp[4];
678
+
679
+ #if defined(__VXE__) || defined(__VXE2__)
680
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
681
+ const uint8x16_t v_1m = vec_splat_u8(0x01);
682
+ const uint8x16_t v_2m = vec_splat_u8(0x02);
683
+
684
+ const int32x4_t v_z = vec_splat_s32(0);
685
+
686
+ const uchar8x16_t v_minsm = {
687
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
688
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
689
+ };
690
+
691
+ int8x16_t q5b[4];
692
+ uint8x16_t q5h[4];
693
+
694
+ uint8x16_t v_xl[2];
695
+ uint8x16_t v_xh[2];
696
+ int8x16_t v_y[4];
697
+
698
+ float sumf = 0;
699
+
700
+ for (int i = 0; i < nb; ++i) {
701
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
702
+ const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
703
+
704
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
705
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
706
+ const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
707
+
708
+ memcpy(utmp, x[i].scales, 12);
709
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
710
+ const uint32_t uaux = utmp[1] & kmask1;
711
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
712
+ utmp[2] = uaux;
713
+ utmp[0] &= kmask1;
714
+
715
+ const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp);
716
+ const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm);
717
+ const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);
718
+
719
+ const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
720
+ const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
721
+ const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
722
+ const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
723
+
724
+ const uint8_t * scales = (const uint8_t *)utmp;
725
+ const uint8_t * GGML_RESTRICT x0l = x[i].qs;
726
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
727
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
728
+
729
+ v_xh[0] = vec_xl(0 , x0h);
730
+ v_xh[1] = vec_xl(16, x0h);
731
+
732
+ int32_t sumi = 0;
733
+ for (int j = 0; j < QK_K/64; ++j) {
734
+ v_xl[0] = vec_xl(0 , x0l);
735
+ v_xl[1] = vec_xl(16, x0l);
736
+ x0l += 32;
737
+
738
+ v_y[0] = vec_xl(0 , y0);
739
+ v_y[1] = vec_xl(16, y0);
740
+ v_y[2] = vec_xl(32, y0);
741
+ v_y[3] = vec_xl(48, y0);
742
+ y0 += 64;
743
+
744
+ q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4);
745
+ q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4);
746
+ q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3);
747
+ q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3);
748
+ v_xh[0] = vec_sr(v_xh[0], 2);
749
+ v_xh[1] = vec_sr(v_xh[1], 2);
750
+
751
+ q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]);
752
+ q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]);
753
+ q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]);
754
+ q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]);
755
+
756
+ int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
757
+ int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
758
+
759
+ sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
760
+ sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
761
+ }
762
+
763
+ sumf += d * sumi - dmin * mins;
764
+ }
765
+
766
+ *s = sumf;
767
+
768
+ #else
769
+
770
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
771
+ const uint8_t * mins = (const uint8_t*)&utmp[2];
772
+
773
+ int8_t aux8[QK_K];
774
+ int16_t aux16[8];
775
+ float sums [8];
776
+ int32_t aux32[8];
777
+ memset(sums, 0, 8*sizeof(float));
778
+
779
+ float sumf = 0;
780
+ for (int i = 0; i < nb; ++i) {
781
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
782
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
783
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
784
+ memset(aux32, 0, 8*sizeof(int32_t));
785
+ int8_t * GGML_RESTRICT a = aux8;
786
+ uint8_t m = 1;
787
+ for (int j = 0; j < QK_K/64; ++j) {
788
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
789
+ for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
790
+ a += 32; m <<= 1;
791
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
792
+ for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
793
+ a += 32; m <<= 1;
794
+ q4 += 32;
795
+ }
796
+ memcpy(utmp, x[i].scales, 12);
797
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
798
+ const uint32_t uaux = utmp[1] & kmask1;
799
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
800
+ utmp[2] = uaux;
801
+ utmp[0] &= kmask1;
802
+
803
+ int sumi = 0;
804
+ for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
805
+ a = aux8;
806
+ int is = 0;
807
+ for (int j = 0; j < QK_K/32; ++j) {
808
+ int32_t scale = scales[is++];
809
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
810
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
811
+ q8 += 8; a += 8;
812
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
813
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
814
+ q8 += 8; a += 8;
815
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
816
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
817
+ q8 += 8; a += 8;
818
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
819
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
820
+ q8 += 8; a += 8;
821
+ }
822
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
823
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
824
+ const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
825
+ sumf -= dmin * sumi;
826
+ }
827
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
828
+ *s = sumf;
829
+ #endif
830
+ }
831
+
832
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
833
+ assert(n % QK_K == 0);
834
+ assert(nrc == 1);
835
+ UNUSED(nrc);
836
+ UNUSED(bx);
837
+ UNUSED(by);
838
+ UNUSED(bs);
839
+
840
+ const block_q6_K * GGML_RESTRICT x = vx;
841
+ const block_q8_K * GGML_RESTRICT y = vy;
842
+
843
+ const int nb = n / QK_K;
844
+
845
+ #if defined(__VXE__) || defined(__VXE2__)
846
+ float sum = 0;
847
+
848
+ // Lower 4-bit and upper 2-bit masks
849
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
850
+ const uint8x16_t v_um = vec_splat_u8(0x03);
851
+
852
+ const int32x4_t v_z = vec_splat_s32(0);
853
+
854
+ int8x16_t q6b[4];
855
+ uint8x16_t q6h[4];
856
+
857
+ uint8x16_t v_xl[4];
858
+ uint8x16_t v_xh[2];
859
+ int8x16_t v_y[4];
860
+
861
+ for (int i = 0; i < nb; ++i) {
862
+ const float d_all = GGML_FP16_TO_FP32(x[i].d);
863
+
864
+ const uint8_t * GGML_RESTRICT x0l = x[i].ql;
865
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
866
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
867
+
868
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
869
+
870
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
871
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
872
+
873
+ const int8x16_t v_scale = vec_xl(0, scale);
874
+ const int16x8_t v_scalel = vec_unpackh(v_scale);
875
+ const int16x8_t v_scaleh = vec_unpackl(v_scale);
876
+
877
+ const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel);
878
+ const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel);
879
+ const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh);
880
+ const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
881
+ const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
882
+
883
+ const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
884
+
885
+ int32_t isum = 0;
886
+ for (int j = 0; j < QK_K/128; ++j) {
887
+ // Load model upper 2 bits
888
+ v_xh[0] = vec_xl(0 , x0h);
889
+ v_xh[1] = vec_xl(16, x0h);
890
+ x0h += 32;
891
+
892
+ // Load model lower 4 bits
893
+ v_xl[0] = vec_xl(0 , x0l);
894
+ v_xl[1] = vec_xl(16, x0l);
895
+ v_xl[2] = vec_xl(32, x0l);
896
+ v_xl[3] = vec_xl(48, x0l);
897
+ x0l += 64;
898
+
899
+ // Load activation quants
900
+ v_y[0] = vec_xl(0 , y0);
901
+ v_y[1] = vec_xl(16, y0);
902
+ v_y[2] = vec_xl(32, y0);
903
+ v_y[3] = vec_xl(48, y0);
904
+ y0 += 64;
905
+
906
+ q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4);
907
+ q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4);
908
+ uint8x16_t shifted = vec_sr(v_xh[0], 2);
909
+ q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
910
+ shifted = vec_sr(v_xh[1], 2);
911
+ q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
912
+
913
+ q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0]));
914
+ q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1]));
915
+ q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2]));
916
+ q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3]));
917
+
918
+ int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
919
+ int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
920
+ int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
921
+ int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
922
+
923
+ isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
924
+ (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
925
+ (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
926
+ (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
927
+
928
+ scale += 4;
929
+
930
+
931
+ // Load activation quants
932
+ v_y[0] = vec_xl(0 , y0);
933
+ v_y[1] = vec_xl(16, y0);
934
+ v_y[2] = vec_xl(32, y0);
935
+ v_y[3] = vec_xl(48, y0);
936
+ y0 += 64;
937
+
938
+ shifted = vec_sr(v_xh[0], 4);
939
+ q6h[0] = vec_sl(vec_and(v_um, shifted), 4);
940
+ shifted = vec_sr(v_xh[1], 4);
941
+ q6h[1] = vec_sl(vec_and(v_um, shifted), 4);
942
+ shifted = vec_sr(v_xh[0], 6);
943
+ q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
944
+ shifted = vec_sr(v_xh[1], 6);
945
+ q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
946
+
947
+ q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0]));
948
+ q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1]));
949
+ q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2]));
950
+ q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3]));
951
+
952
+ summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
953
+ summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
954
+ summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
955
+ summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
956
+
957
+ isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
958
+ (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
959
+ (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
960
+ (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
961
+
962
+ scale += 4;
963
+ }
964
+
965
+ sum += d_all * y[i].d * (isum - 32 * mins);
966
+ }
967
+
968
+ *s = sum;
969
+
970
+ #else
971
+
972
+ int8_t aux8[QK_K];
973
+ int16_t aux16[8];
974
+ float sums [8];
975
+ int32_t aux32[8];
976
+ memset(sums, 0, 8*sizeof(float));
977
+
978
+ float sumf = 0;
979
+ for (int i = 0; i < nb; ++i) {
980
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
981
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
982
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
983
+ memset(aux32, 0, 8*sizeof(int32_t));
984
+ int8_t * GGML_RESTRICT a = aux8;
985
+ for (int j = 0; j < QK_K; j += 128) {
986
+ for (int l = 0; l < 32; ++l) {
987
+ a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
988
+ a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
989
+ a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
990
+ a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
991
+ }
992
+ a += 128;
993
+ q4 += 64;
994
+ qh += 32;
995
+ }
996
+ a = aux8;
997
+ int is = 0;
998
+ for (int j = 0; j < QK_K/16; ++j) {
999
+ int scale = x[i].scales[is++];
1000
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1001
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1002
+ q8 += 8; a += 8;
1003
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1004
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1005
+ q8 += 8; a += 8;
1006
+ }
1007
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1008
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1009
+ }
1010
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
1011
+ *s = sumf;
1012
+ #endif
1013
+ }
1014
+
1015
+ // #if defined(__VXE__) || defined(__VXE2__)
1016
+ // static const int8_t keven_signs_q2xs[1024] = {
1017
+ // 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
1018
+ // 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
1019
+ // 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1,
1020
+ // 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1,
1021
+ // 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1,
1022
+ // 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1,
1023
+ // 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1,
1024
+ // 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1,
1025
+ // 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1,
1026
+ // 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1,
1027
+ // 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1,
1028
+ // 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1,
1029
+ // 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1,
1030
+ // 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1,
1031
+ // 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1,
1032
+ // 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1,
1033
+ // 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1,
1034
+ // 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1,
1035
+ // 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1,
1036
+ // 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1,
1037
+ // 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1,
1038
+ // 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1,
1039
+ // 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1,
1040
+ // 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1,
1041
+ // 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1,
1042
+ // 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1,
1043
+ // 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1,
1044
+ // 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1,
1045
+ // 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1,
1046
+ // 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1,
1047
+ // 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
1048
+ // 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
1049
+ // };
1050
+ // #endif
1051
+
1052
+ // void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1053
+ // assert(n % QK_K == 0);
1054
+ // assert(nrc == 1);
1055
+ // UNUSED(nrc);
1056
+ // UNUSED(bx);
1057
+ // UNUSED(by);
1058
+ // UNUSED(bs);
1059
+
1060
+ // const block_iq2_xxs * GGML_RESTRICT x = vx;
1061
+ // const block_q8_K * GGML_RESTRICT y = vy;
1062
+
1063
+ // const int nb = n / QK_K;
1064
+
1065
+ // #if defined(__VXE__) || defined(__VXE2__)
1066
+ // const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
1067
+
1068
+ // uint32_t aux32[4];
1069
+ // const uint8_t * aux8 = (const uint8_t *)aux32;
1070
+
1071
+ // float sumf = 0;
1072
+
1073
+ // for (int i = 0; i < nb; ++i) {
1074
+ // const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1075
+ // const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1076
+ // const int8_t * GGML_RESTRICT q8 = y[i].qs;
1077
+
1078
+ // float sumf1 = 0, sumf2 = 0;
1079
+
1080
+ // for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
1081
+ // int8x16_t q8b0 = vec_xl( 0, q8);
1082
+ // int8x16_t q8b1 = vec_xl(16, q8);
1083
+ // int8x16_t q8b2 = vec_xl(32, q8);
1084
+ // int8x16_t q8b3 = vec_xl(48, q8);
1085
+ // q8 += 64;
1086
+
1087
+ // memcpy(aux32, q2, 4 * sizeof(uint32_t));
1088
+ // q2 += 8;
1089
+
1090
+ // int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) };
1091
+ // int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) };
1092
+ // int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) };
1093
+ // int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) };
1094
+
1095
+ // int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127)) };
1096
+ // int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) };
1097
+ // int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127)) };
1098
+ // int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) };
1099
+
1100
+ // q2u0 = vec_mul(q2u0, q2s0);
1101
+ // q2u1 = vec_mul(q2u1, q2s1);
1102
+ // q2u2 = vec_mul(q2u2, q2s2);
1103
+ // q2u3 = vec_mul(q2u3, q2s3);
1104
+
1105
+ // const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1);
1106
+ // const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3);
1107
+
1108
+ // sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28));
1109
+ // sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28));
1110
+ // }
1111
+
1112
+ // sumf += d * (sumf1 + sumf2);
1113
+ // }
1114
+
1115
+ // *s = 0.25f * sumf;
1116
+
1117
+ // #else
1118
+
1119
+ // uint32_t aux32[2];
1120
+ // const uint8_t * aux8 = (const uint8_t *)aux32;
1121
+
1122
+ // float sumf = 0.f;
1123
+ // for (int i = 0; i < nb; ++i) {
1124
+ // const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1125
+ // const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1126
+ // const int8_t * GGML_RESTRICT q8 = y[i].qs;
1127
+ // int32_t bsum = 0;
1128
+ // for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
1129
+ // memcpy(aux32, q2, 2*sizeof(uint32_t));
1130
+ // q2 += 4;
1131
+ // const uint32_t ls = 2*(aux32[1] >> 28) + 1;
1132
+ // int32_t sumi = 0;
1133
+ // for (int l = 0; l < 4; ++l) {
1134
+ // const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
1135
+ // const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
1136
+ // for (int j = 0; j < 8; ++j) {
1137
+ // sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
1138
+ // }
1139
+ // q8 += 8;
1140
+ // }
1141
+ // bsum += sumi * ls;
1142
+ // }
1143
+ // sumf += d * bsum;
1144
+ // }
1145
+ // *s = 0.125f * sumf;
1146
+ // #endif
1147
+ // }
1148
+ // Dot product of n IQ4_NL-quantized values (vx) with n Q8_0-quantized values (vy); the result is stored in *s.
1149
+ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1150
+ assert(nrc == 1); // this implementation only handles nrc == 1
1151
+ UNUSED(nrc); // nrc, bx, by and bs are not otherwise used in this function
1152
+ UNUSED(bx);
1153
+ UNUSED(by);
1154
+ UNUSED(bs);
1155
+ assert(n % QK4_NL == 0); // n must be a whole number of IQ4_NL blocks
1156
+ static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); // x[ib] and y[ib] are paired block-by-block below
1157
+ // View the untyped operands as arrays of quantized blocks.
1158
+ const block_iq4_nl * GGML_RESTRICT x = vx;
1159
+ const block_q8_0 * GGML_RESTRICT y = vy;
1160
+
1161
+ const int nb = n / QK4_NL; // number of blocks to process
1162
+ // ib and sumf are shared by the vector loop (when compiled in) and the scalar loop at the end.
1163
+ int ib = 0;
1164
+ float sumf = 0;
1165
+ // Vector path: compiled only when __VXE__ or __VXE2__ is defined.
1166
+ #if defined(__VXE__) || defined(__VXE2__)
1167
+ const int8x16_t v_k = vec_xl(0, kvalues_iq4nl); // kvalues_iq4nl table, used as the lookup source for vec_perm below
1168
+ const uint8x16_t v_m = vec_splat_u8(0x0F); // mask keeping the low nibble of every byte
1169
+
1170
+ for (; ib < nb; ++ib) {
1171
+ const block_iq4_nl * GGML_RESTRICT x0 = &x[ib];
1172
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
1173
+ // Split each packed byte of x0->qs into its low and high 4-bit index.
1174
+ const uint8x16_t v_x = vec_xl(0, x0->qs);
1175
+ int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
1176
+ int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
1177
+ // Replace every 4-bit index by the kvalues_iq4nl entry it selects (vec_perm used as a table lookup).
1178
+ v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
1179
+ v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
1180
+ // Low nibbles pair with the first QK8_0/2 values of y0->qs, high nibbles with the values starting at QK8_0/2.
1181
+ const int8x16_t v_yl = vec_xl(0 , y0->qs);
1182
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
1183
+ const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
1184
+ // Add the four 32-bit lanes and scale by the product of both block scales.
1185
+ sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
1186
+ }
1187
+ // The loop above runs ib up to nb, so the scalar loop below has nothing left to do when this path is compiled in.
1188
+ #endif
1189
+ for (; ib < nb; ++ib) { // scalar path: processes every block from the current ib onward
1190
+ const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); // combined scale of the two blocks
1191
+ int sumi1 = 0, sumi2 = 0;
1192
+ for (int j = 0; j < QK4_NL/2; ++j) {
1193
+ sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; // low nibble pairs with the first half of y's values
1194
+ sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; // high nibble pairs with the second half
1195
+ }
1196
+ sumf += d * (sumi1 + sumi2);
1197
+ }
1198
+ *s = sumf;
1199
+ }
1200
+ // Dot product of n IQ4_XS-quantized values (vx) with n Q8_K-quantized values (vy); the result is stored in *s.
1201
+ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1202
+ assert(nrc == 1); // this implementation only handles nrc == 1
1203
+ UNUSED(nrc); // nrc, bx, by and bs are not otherwise used in this function
1204
+ UNUSED(bx);
1205
+ UNUSED(by);
1206
+ UNUSED(bs);
1207
+ assert(n % QK_K == 0); // n must be a whole number of QK_K-sized blocks
1208
+ // View the untyped operands as arrays of quantized blocks.
1209
+ const block_iq4_xs * GGML_RESTRICT x = vx;
1210
+ const block_q8_K * GGML_RESTRICT y = vy;
1211
+
1212
+ const int nb = n / QK_K; // number of blocks to process
1213
+ // Vector path: compiled only when __VXE__ or __VXE2__ is defined; otherwise the scalar path after #else is used.
1214
+ #if defined(__VXE__) || defined(__VXE2__)
1215
+ const int8x16_t v_k = vec_xl(0, kvalues_iq4nl); // kvalues_iq4nl table, used as the lookup source for vec_perm below
1216
+ const uint8x16_t v_m = vec_splat_u8(0x0F); // mask keeping the low nibble of every byte
1217
+
1218
+ float sumf = 0;
1219
+
1220
+ for (int ibl = 0; ibl < nb; ++ibl) {
1221
+ const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
1222
+ const int8_t * GGML_RESTRICT q8 = y[ibl].qs;
1223
+ // h supplies the upper 2 bits of each scale; 4 bits (two scales) are consumed per inner iteration.
1224
+ uint16_t h = x[ibl].scales_h;
1225
+
1226
+ int sumi1 = 0, sumi2 = 0;
1227
+ for (int ib = 0; ib < QK_K/64; ++ib) { // each iteration advances q4 by 32 bytes and q8 by 64 values
1228
+ const uint8x16_t v_x0 = vec_xl(0 , q4);
1229
+ const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4);
1230
+ q4 += 32;
1231
+ // Split each packed byte into its low and high 4-bit index.
1232
+ int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
1233
+ int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
1234
+ int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
1235
+ int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
1236
+ // Replace every 4-bit index by the kvalues_iq4nl entry it selects (vec_perm used as a table lookup).
1237
+ v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
1238
+ v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
1239
+ v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
1240
+ v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
1241
+ // Load the q8 values at byte offsets 0, 16, 32 and 48 that pair with the nibbles above.
1242
+ const int8x16_t v_y0 = vec_xl( 0, q8);
1243
+ const int8x16_t v_y1 = vec_xl(16, q8);
1244
+ const int8x16_t v_y2 = vec_xl(32, q8);
1245
+ const int8x16_t v_y3 = vec_xl(48, q8);
1246
+ q8 += 64;
1247
+ // v_x0 low/high nibbles pair with v_y0/v_y1, v_x1 low/high nibbles with v_y2/v_y3 (same pairing as the scalar path).
1248
+ int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1);
1249
+ int32x4_t vsumi1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3);
1250
+ // Rebuild two 6-bit scales (low 4 bits from a scales_l nibble, bits 4-5 from h) and subtract the offset of 32.
1251
+ int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32;
1252
+ int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
1253
+
1254
+ h >>= 4;
1255
+ // Add the four 32-bit lanes of each partial sum and weight it by its scale.
1256
+ sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
1257
+ sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
1258
+ }
1259
+ // Apply the combined per-block scale x[ibl].d * y[ibl].d to the integer total.
1260
+ sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
1261
+ }
1262
+
1263
+ *s = sumf;
1264
+
1265
+ #else
1266
+ float sumf = 0;
1267
+ for (int ibl = 0; ibl < nb; ++ibl) {
1268
+ const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; // combined scale of the two blocks
1269
+ uint16_t h = x[ibl].scales_h; // supplies the upper 2 bits of each scale, 4 bits per outer iteration
1270
+ const uint8_t * qs = x[ibl].qs;
1271
+ const int8_t * q8 = y[ibl].qs;
1272
+ for (int ib = 0; ib < QK_K/32; ib += 2) { // two groups of 32 values (16 packed bytes each) per iteration
1273
+ const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30); // low nibble of scales_l + bits 0-1 of h
1274
+ const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30); // high nibble of scales_l + bits 2-3 of h
1275
+ h >>= 4;
1276
+ const float d1 = d4d8*(ls1 - 32); // the 6-bit scale is used with an offset of 32 subtracted
1277
+ const float d2 = d4d8*(ls2 - 32);
1278
+ int sumi1 = 0, sumi2 = 0;
1279
+ for (int j = 0; j < 16; ++j) { // first group, scaled by d1
1280
+ sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; // low nibble pairs with q8[j]
1281
+ sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; // high nibble pairs with q8[j+16]
1282
+ }
1283
+ sumf += d1 * (sumi1 + sumi2);
1284
+ qs += 16;
1285
+ q8 += 32;
1286
+ sumi1 = sumi2 = 0; // reset the accumulators for the second group
1287
+ for (int j = 0; j < 16; ++j) { // second group, scaled by d2
1288
+ sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
1289
+ sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
1290
+ }
1291
+ sumf += d2 * (sumi1 + sumi2);
1292
+ qs += 16;
1293
+ q8 += 32;
1294
+ }
1295
+ }
1296
+ *s = sumf;
1297
+ #endif
1298
+ }
1299
+