@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -0,0 +1,1027 @@
1
+ #include "mmvq.hpp"
2
+ #include "vecdotq.hpp"
3
+
4
+
5
+ template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
6
+ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
7
+ const sycl::nd_item<3> &item_ct1) {
8
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
9
+ item_ct1.get_local_id(1);
10
+
11
+ if (row >= nrows) {
12
+ return;
13
+ }
14
+
15
+ const int blocks_per_row = ncols / qk;
16
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
17
+
18
+ // partial sum for each thread
19
+ float tmp = 0.0f;
20
+
21
+ const block_q_t * x = (const block_q_t *) vx;
22
+ const block_q8_1 * y = (const block_q8_1 *) vy;
23
+
24
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
25
+ i += blocks_per_warp) {
26
+ const int ibx = row*blocks_per_row + i; // x block index
27
+
28
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
29
+
30
+ const int iqs =
31
+ vdr *
32
+ (item_ct1.get_local_id(2) %
33
+ (qi / vdr)); // x block quant index when casting the quants to int
34
+
35
+ tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
36
+ }
37
+
38
+ // sum up partial sums and write back result
39
+ #pragma unroll
40
+ for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
41
+ tmp +=
42
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
43
+ }
44
+
45
+ if (item_ct1.get_local_id(2) == 0) {
46
+ dst[row] = tmp;
47
+ }
48
+ }
49
+
50
+ template <int qk, int qi, typename block_q_t, int vdr>
51
+ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
52
+ const void *__restrict__ vy,
53
+ float *__restrict__ dst, const int ncols,
54
+ const int nrows,
55
+ const sycl::nd_item<3> &item_ct1) {
56
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
57
+ item_ct1.get_local_id(1);
58
+
59
+ if (row >= nrows) {
60
+ return;
61
+ }
62
+
63
+ const int blocks_per_row = ncols / qk;
64
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
65
+
66
+ // partial sum for each thread
67
+ float tmp = 0.0f;
68
+
69
+ const block_q_t * x = (const block_q_t *) vx;
70
+ const block_q8_1 * y = (const block_q8_1 *) vy;
71
+
72
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
73
+ i += blocks_per_warp) {
74
+ const int ibx = row*blocks_per_row + i; // x block index
75
+
76
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
77
+
78
+ const int iqs =
79
+ vdr *
80
+ (item_ct1.get_local_id(2) %
81
+ (qi / vdr)); // x block quant index when casting the quants to int
82
+
83
+ tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
84
+ }
85
+
86
+ // sum up partial sums and write back result
87
+ #pragma unroll
88
+ for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
89
+ tmp +=
90
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
91
+ }
92
+
93
+ if (item_ct1.get_local_id(2) == 0) {
94
+ dst[row] = tmp;
95
+ }
96
+ }
97
+
98
+ template <int qk, int qi, typename block_q_t, int vdr>
99
+ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
100
+ const void *__restrict__ vy,
101
+ float *__restrict__ dst, const int ncols,
102
+ const int nrows,
103
+ const sycl::nd_item<3> &item_ct1) {
104
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
105
+ item_ct1.get_local_id(1);
106
+
107
+ if (row >= nrows) {
108
+ return;
109
+ }
110
+
111
+ const int blocks_per_row = ncols / qk;
112
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
113
+
114
+ // partial sum for each thread
115
+ float tmp = 0.0f;
116
+
117
+ const block_q_t * x = (const block_q_t *) vx;
118
+ const block_q8_1 * y = (const block_q8_1 *) vy;
119
+
120
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
121
+ i += blocks_per_warp) {
122
+ const int ibx = row*blocks_per_row + i; // x block index
123
+
124
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
125
+
126
+ const int iqs =
127
+ vdr *
128
+ (item_ct1.get_local_id(2) %
129
+ (qi / vdr)); // x block quant index when casting the quants to int
130
+
131
+ tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64);
132
+ }
133
+
134
+ // sum up partial sums and write back result
135
+ #pragma unroll
136
+ for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
137
+ tmp +=
138
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
139
+ }
140
+
141
+ if (item_ct1.get_local_id(2) == 0) {
142
+ dst[row] = tmp;
143
+ }
144
+ }
145
+
146
+ template <int qk, int qi, typename block_q_t, int vdr>
147
+ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
148
+ const void *__restrict__ vy,
149
+ float *__restrict__ dst, const int ncols,
150
+ const int nrows,
151
+ const sycl::nd_item<3> &item_ct1) {
152
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
153
+ item_ct1.get_local_id(1);
154
+
155
+ if (row >= nrows) {
156
+ return;
157
+ }
158
+
159
+ const int blocks_per_row = ncols / qk;
160
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
161
+
162
+ // partial sum for each thread
163
+ float tmp = 0.0f;
164
+
165
+ const block_q_t * x = (const block_q_t *) vx;
166
+ const block_q8_1 * y = (const block_q8_1 *) vy;
167
+
168
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
169
+ i += blocks_per_warp) {
170
+ const int ibx = row*blocks_per_row + i; // x block index
171
+
172
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
173
+
174
+ const int iqs =
175
+ vdr *
176
+ (item_ct1.get_local_id(2) %
177
+ (qi / vdr)); // x block quant index when casting the quants to int
178
+
179
+ tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs);
180
+ }
181
+
182
+ // sum up partial sums and write back result
183
+ #pragma unroll
184
+ for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
185
+ tmp +=
186
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
187
+ }
188
+
189
+ if (item_ct1.get_local_id(2) == 0) {
190
+ dst[row] = tmp;
191
+ }
192
+ }
193
+
194
+ template <int qk, int qi, typename block_q_t, int vdr>
195
+ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
196
+ const void *__restrict__ vy,
197
+ float *__restrict__ dst, const int ncols,
198
+ const int nrows,
199
+ const sycl::nd_item<3> &item_ct1) {
200
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
201
+ item_ct1.get_local_id(1);
202
+
203
+ if (row >= nrows) {
204
+ return;
205
+ }
206
+
207
+ const int blocks_per_row = ncols / qk;
208
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
209
+
210
+ // partial sum for each thread
211
+ float tmp = 0.0f;
212
+
213
+ const block_q_t * x = (const block_q_t *) vx;
214
+ const block_q8_1 * y = (const block_q8_1 *) vy;
215
+
216
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
217
+ i += blocks_per_warp) {
218
+ const int ibx = row*blocks_per_row + i; // x block index
219
+
220
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
221
+
222
+ const int iqs =
223
+ vdr *
224
+ (item_ct1.get_local_id(2) %
225
+ (qi / vdr)); // x block quant index when casting the quants to int
226
+
227
+ tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64);
228
+ }
229
+
230
+ // sum up partial sums and write back result
231
+ #pragma unroll
232
+ for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
233
+ tmp +=
234
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
235
+ }
236
+
237
+ if (item_ct1.get_local_id(2) == 0) {
238
+ dst[row] = tmp;
239
+ }
240
+ }
241
+
242
+ template <int qk, int qi, typename block_q_t, int vdr>
243
+ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
244
+ const void *__restrict__ vy,
245
+ float *__restrict__ dst, const int ncols,
246
+ const int nrows,
247
+ const sycl::nd_item<3> &item_ct1) {
248
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
249
+ item_ct1.get_local_id(1);
250
+
251
+ if (row >= nrows) {
252
+ return;
253
+ }
254
+
255
+ const int blocks_per_row = ncols / qk;
256
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
257
+
258
+ // partial sum for each thread
259
+ float tmp = 0.0f;
260
+
261
+ const block_q_t * x = (const block_q_t *) vx;
262
+ const block_q8_1 * y = (const block_q8_1 *) vy;
263
+
264
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
265
+ i += blocks_per_warp) {
266
+ const int ibx = row*blocks_per_row + i; // x block index
267
+
268
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
269
+
270
+ const int iqs =
271
+ vdr *
272
+ (item_ct1.get_local_id(2) %
273
+ (qi / vdr)); // x block quant index when casting the quants to int
274
+
275
+ tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid);
276
+ }
277
+
278
+ // sum up partial sums and write back result
279
+ #pragma unroll
280
+ for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
281
+ tmp +=
282
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
283
+ }
284
+
285
+ if (item_ct1.get_local_id(2) == 0) {
286
+ dst[row] = tmp;
287
+ }
288
+ }
289
+
290
+ template <int qk, int qi, typename block_q_t, int vdr>
291
+ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
292
+ const void *__restrict__ vy,
293
+ float *__restrict__ dst, const int ncols,
294
+ const int nrows,
295
+ const sycl::nd_item<3> &item_ct1) {
296
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
297
+ item_ct1.get_local_id(1);
298
+
299
+ if (row >= nrows) {
300
+ return;
301
+ }
302
+
303
+ const int blocks_per_row = ncols / qk;
304
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
305
+
306
+ // partial sum for each thread
307
+ float tmp = 0.0f;
308
+
309
+ const block_q_t * x = (const block_q_t *) vx;
310
+ const block_q8_1 * y = (const block_q8_1 *) vy;
311
+
312
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
313
+ i += blocks_per_warp) {
314
+ const int ibx = row*blocks_per_row + i; // x block index
315
+
316
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
317
+
318
+ const int iqs =
319
+ vdr *
320
+ (item_ct1.get_local_id(2) %
321
+ (qi / vdr)); // x block quant index when casting the quants to int
322
+
323
+ tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu);
324
+ }
325
+
326
+ // sum up partial sums and write back result
327
+ #pragma unroll
328
+ for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
329
+ tmp +=
330
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
331
+ }
332
+
333
+ if (item_ct1.get_local_id(2) == 0) {
334
+ dst[row] = tmp;
335
+ }
336
+ }
337
+
338
+ template <int qk, int qi, typename block_q_t, int vdr>
339
+ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
340
+ const void *__restrict__ vy,
341
+ float *__restrict__ dst, const int ncols,
342
+ const int nrows,
343
+ const sycl::nd_item<3> &item_ct1) {
344
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
345
+ item_ct1.get_local_id(1);
346
+
347
+ if (row >= nrows) {
348
+ return;
349
+ }
350
+
351
+ const int blocks_per_row = ncols / qk;
352
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
353
+
354
+ // partial sum for each thread
355
+ float tmp = 0.0f;
356
+
357
+ const block_q_t * x = (const block_q_t *) vx;
358
+ const block_q8_1 * y = (const block_q8_1 *) vy;
359
+
360
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
361
+ i += blocks_per_warp) {
362
+ const int ibx = row*blocks_per_row + i; // x block index
363
+
364
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
365
+
366
+ const int iqs =
367
+ vdr *
368
+ (item_ct1.get_local_id(2) %
369
+ (qi / vdr)); // x block quant index when casting the quants to int
370
+
371
+ tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs);
372
+ }
373
+
374
+ // sum up partial sums and write back result
375
+ #pragma unroll
376
+ for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
377
+ tmp +=
378
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
379
+ }
380
+
381
+ if (item_ct1.get_local_id(2) == 0) {
382
+ dst[row] = tmp;
383
+ }
384
+ }
385
+
386
+ template <int qk, int qi, typename block_q_t, int vdr>
387
+ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
388
+ const void *__restrict__ vy,
389
+ float *__restrict__ dst, const int ncols,
390
+ const int nrows,
391
+ const sycl::nd_item<3> &item_ct1) {
392
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
393
+ item_ct1.get_local_id(1);
394
+
395
+ if (row >= nrows) {
396
+ return;
397
+ }
398
+
399
+ const int blocks_per_row = ncols / qk;
400
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
401
+
402
+ // partial sum for each thread
403
+ float tmp = 0.0f;
404
+
405
+ const block_q_t * x = (const block_q_t *) vx;
406
+ const block_q8_1 * y = (const block_q8_1 *) vy;
407
+
408
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
409
+ i += blocks_per_warp) {
410
+ const int ibx = row*blocks_per_row + i; // x block index
411
+
412
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
413
+
414
+ const int iqs =
415
+ vdr *
416
+ (item_ct1.get_local_id(2) %
417
+ (qi / vdr)); // x block quant index when casting the quants to int
418
+
419
+ tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs);
420
+ }
421
+
422
+ // sum up partial sums and write back result
423
+ #pragma unroll
424
+ for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
425
+ tmp +=
426
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
427
+ }
428
+
429
+ if (item_ct1.get_local_id(2) == 0) {
430
+ dst[row] = tmp;
431
+ }
432
+ }
433
+
434
+
435
+ template <int qk, int qi, typename block_q_t, int vdr>
436
+ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
437
+ const void *__restrict__ vy,
438
+ float *__restrict__ dst, const int ncols,
439
+ const int nrows,
440
+ const sycl::nd_item<3> &item_ct1) {
441
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
442
+ item_ct1.get_local_id(1);
443
+
444
+ if (row >= nrows) {
445
+ return;
446
+ }
447
+
448
+ const int blocks_per_row = ncols / qk;
449
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
450
+
451
+ // partial sum for each thread
452
+ float tmp = 0.0f;
453
+
454
+ const block_q_t * x = (const block_q_t *) vx;
455
+ const block_q8_1 * y = (const block_q8_1 *) vy;
456
+
457
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
458
+ i += blocks_per_warp) {
459
+ const int ibx = row*blocks_per_row + i; // x block index
460
+
461
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
462
+
463
+ const int iqs =
464
+ vdr *
465
+ (item_ct1.get_local_id(2) %
466
+ (qi / vdr)); // x block quant index when casting the quants to int
467
+
468
+ tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs);
469
+ }
470
+
471
+ // sum up partial sums and write back result
472
+ #pragma unroll
473
+ for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
474
+ tmp +=
475
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
476
+ }
477
+
478
+ if (item_ct1.get_local_id(2) == 0) {
479
+ dst[row] = tmp;
480
+ }
481
+ }
482
+
483
+ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
484
+ float *dst, const int ncols,
485
+ const int nrows,
486
+ dpct::queue_ptr stream) {
487
+ GGML_ASSERT(ncols % QK4_0 == 0);
488
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
489
+ const sycl::range<3> block_nums(1, 1, block_num_y);
490
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
491
+ {
492
+
493
+ stream->submit([&](sycl::handler &cgh) {
494
+
495
+ cgh.parallel_for(
496
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
497
+ [=](sycl::nd_item<3> item_ct1)
498
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
499
+ mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
500
+ VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
501
+ vx, vy, dst, ncols, nrows, item_ct1);
502
+ });
503
+ });
504
+ }
505
+ }
506
+
507
+ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
508
+ float *dst, const int ncols,
509
+ const int nrows,
510
+ dpct::queue_ptr stream) {
511
+ GGML_ASSERT(ncols % QK4_1 == 0);
512
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
513
+ const sycl::range<3> block_nums(1, 1, block_num_y);
514
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
515
+ {
516
+
517
+ stream->submit([&](sycl::handler &cgh) {
518
+
519
+ cgh.parallel_for(
520
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
521
+ [=](sycl::nd_item<3> item_ct1)
522
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
523
+ mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
524
+ VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
525
+ vx, vy, dst, ncols, nrows, item_ct1);
526
+ });
527
+ });
528
+ }
529
+ }
530
+
531
+ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
532
+ float *dst, const int ncols,
533
+ const int nrows,
534
+ dpct::queue_ptr stream) {
535
+ GGML_ASSERT(ncols % QK5_0 == 0);
536
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
537
+ const sycl::range<3> block_nums(1, 1, block_num_y);
538
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
539
+ {
540
+
541
+ stream->submit([&](sycl::handler &cgh) {
542
+
543
+ cgh.parallel_for(
544
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
545
+ [=](sycl::nd_item<3> item_ct1)
546
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
547
+ mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
548
+ VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
549
+ vx, vy, dst, ncols, nrows, item_ct1);
550
+ });
551
+ });
552
+ }
553
+ }
554
+
555
+ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
556
+ float *dst, const int ncols,
557
+ const int nrows,
558
+ dpct::queue_ptr stream) {
559
+ GGML_ASSERT(ncols % QK5_1 == 0);
560
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
561
+ const sycl::range<3> block_nums(1, 1, block_num_y);
562
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
563
+ {
564
+
565
+ stream->submit([&](sycl::handler &cgh) {
566
+
567
+ cgh.parallel_for(
568
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
569
+ [=](sycl::nd_item<3> item_ct1)
570
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
571
+ mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
572
+ VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
573
+ vx, vy, dst, ncols, nrows, item_ct1);
574
+ });
575
+ });
576
+ }
577
+ }
578
+
579
+ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
580
+ float *dst, const int ncols,
581
+ const int nrows,
582
+ dpct::queue_ptr stream) {
583
+ GGML_ASSERT(ncols % QK8_0 == 0);
584
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
585
+ const sycl::range<3> block_nums(1, 1, block_num_y);
586
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
587
+ {
588
+
589
+ stream->submit([&](sycl::handler &cgh) {
590
+
591
+ cgh.parallel_for(
592
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
593
+ [=](sycl::nd_item<3> item_ct1)
594
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
595
+ mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
596
+ VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
597
+ vx, vy, dst, ncols, nrows, item_ct1);
598
+ });
599
+ });
600
+ }
601
+ }
602
+
603
+ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
604
+ float *dst, const int ncols,
605
+ const int nrows,
606
+ dpct::queue_ptr stream) {
607
+ GGML_ASSERT(ncols % QK_K == 0);
608
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
609
+ const sycl::range<3> block_nums(1, 1, block_num_y);
610
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
611
+ {
612
+
613
+ stream->submit([&](sycl::handler &cgh) {
614
+
615
+ cgh.parallel_for(
616
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
617
+ [=](sycl::nd_item<3> item_ct1)
618
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
619
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
620
+ VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
621
+ vx, vy, dst, ncols, nrows, item_ct1);
622
+ });
623
+ });
624
+ }
625
+ }
626
+
627
+ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
628
+ float *dst, const int ncols,
629
+ const int nrows,
630
+ dpct::queue_ptr stream) {
631
+ GGML_ASSERT(ncols % QK_K == 0);
632
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
633
+ const sycl::range<3> block_nums(1, 1, block_num_y);
634
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
635
+ {
636
+
637
+ stream->submit([&](sycl::handler &cgh) {
638
+
639
+ cgh.parallel_for(
640
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
641
+ [=](sycl::nd_item<3> item_ct1)
642
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
643
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
644
+ VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
645
+ vx, vy, dst, ncols, nrows, item_ct1);
646
+ });
647
+ });
648
+ }
649
+ }
650
+
651
+ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
652
+ float *dst, const int ncols,
653
+ const int nrows,
654
+ dpct::queue_ptr stream) {
655
+ GGML_ASSERT(ncols % QK_K == 0);
656
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
657
+ const sycl::range<3> block_nums(1, 1, block_num_y);
658
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
659
+ {
660
+
661
+ stream->submit([&](sycl::handler &cgh) {
662
+
663
+ cgh.parallel_for(
664
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
665
+ [=](sycl::nd_item<3> item_ct1)
666
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
667
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
668
+ VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
669
+ vx, vy, dst, ncols, nrows, item_ct1);
670
+ });
671
+ });
672
+ }
673
+ }
674
+
675
+ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
676
+ float *dst, const int ncols,
677
+ const int nrows,
678
+ dpct::queue_ptr stream) {
679
+ GGML_ASSERT(ncols % QK_K == 0);
680
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
681
+ const sycl::range<3> block_nums(1, 1, block_num_y);
682
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
683
+ {
684
+
685
+ stream->submit([&](sycl::handler &cgh) {
686
+
687
+ cgh.parallel_for(
688
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
689
+ [=](sycl::nd_item<3> item_ct1)
690
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
691
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
692
+ VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
693
+ vx, vy, dst, ncols, nrows, item_ct1);
694
+ });
695
+ });
696
+ }
697
+ }
698
+
699
+ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
700
+ float *dst, const int ncols,
701
+ const int nrows,
702
+ dpct::queue_ptr stream) {
703
+ GGML_ASSERT(ncols % QK_K == 0);
704
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
705
+ const sycl::range<3> block_nums(1, 1, block_num_y);
706
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
707
+ {
708
+
709
+ stream->submit([&](sycl::handler &cgh) {
710
+
711
+ cgh.parallel_for(
712
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
713
+ [=](sycl::nd_item<3> item_ct1)
714
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
715
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
716
+ VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
717
+ vx, vy, dst, ncols, nrows, item_ct1);
718
+ });
719
+ });
720
+ }
721
+ }
722
+
723
+
724
+ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
725
+ float *dst, const int ncols,
726
+ const int nrows,
727
+ dpct::queue_ptr stream) {
728
+ GGML_ASSERT(ncols % QK_K == 0);
729
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
730
+ const sycl::range<3> block_nums(1, 1, block_num_y);
731
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
732
+ {
733
+ stream->submit([&](sycl::handler &cgh) {
734
+ cgh.parallel_for(
735
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
736
+ [=](sycl::nd_item<3> item_ct1)
737
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
738
+ mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
739
+ vx, vy, dst, ncols, nrows, item_ct1);
740
+ });
741
+ });
742
+ }
743
+ }
744
+
745
+ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
746
+ float *dst, const int ncols,
747
+ const int nrows,
748
+ dpct::queue_ptr stream) {
749
+ GGML_ASSERT(ncols % QK_K == 0);
750
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
751
+ const sycl::range<3> block_nums(1, 1, block_num_y);
752
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
753
+ {
754
+
755
+ stream->submit([&](sycl::handler &cgh) {
756
+ auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
757
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
758
+
759
+ cgh.parallel_for(
760
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
761
+ [=](sycl::nd_item<3> item_ct1)
762
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
763
+ mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
764
+ vx, vy, dst, ncols, nrows, item_ct1);
765
+ });
766
+ });
767
+ }
768
+ }
769
+
770
+ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
771
+ float *dst, const int ncols,
772
+ const int nrows,
773
+ dpct::queue_ptr stream) {
774
+ GGML_ASSERT(ncols % QK_K == 0);
775
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
776
+ const sycl::range<3> block_nums(1, 1, block_num_y);
777
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
778
+ {
779
+
780
+ stream->submit([&](sycl::handler &cgh) {
781
+ auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
782
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
783
+
784
+ cgh.parallel_for(
785
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
786
+ [=](sycl::nd_item<3> item_ct1)
787
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
788
+ mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
789
+ vx, vy, dst, ncols, nrows, item_ct1);
790
+ });
791
+ });
792
+ }
793
+ }
794
+
795
+ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
796
+ float *dst, const int ncols,
797
+ const int nrows,
798
+ dpct::queue_ptr stream) {
799
+ GGML_ASSERT(ncols % QK_K == 0);
800
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
801
+ const sycl::range<3> block_nums(1, 1, block_num_y);
802
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
803
+ {
804
+
805
+ stream->submit([&](sycl::handler &cgh) {
806
+ auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
807
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
808
+
809
+ cgh.parallel_for(
810
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
811
+ [=](sycl::nd_item<3> item_ct1)
812
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
813
+ mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
814
+ vx, vy, dst, ncols, nrows, item_ct1);
815
+ });
816
+ });
817
+ }
818
+ }
819
+
820
+ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
821
+ float *dst, const int ncols,
822
+ const int nrows,
823
+ dpct::queue_ptr stream) {
824
+ GGML_ASSERT(ncols % QK_K == 0);
825
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
826
+ const sycl::range<3> block_nums(1, 1, block_num_y);
827
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
828
+ {
829
+
830
+ stream->submit([&](sycl::handler &cgh) {
831
+ auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
832
+
833
+ cgh.parallel_for(
834
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
835
+ [=](sycl::nd_item<3> item_ct1)
836
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
837
+ mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
838
+ vx, vy, dst, ncols, nrows, item_ct1);
839
+ });
840
+ });
841
+ }
842
+ }
843
+
844
+ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
845
+ float *dst, const int ncols,
846
+ const int nrows,
847
+ dpct::queue_ptr stream) {
848
+ GGML_ASSERT(ncols % QK_K == 0);
849
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
850
+ const sycl::range<3> block_nums(1, 1, block_num_y);
851
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
852
+ {
853
+
854
+ stream->submit([&](sycl::handler &cgh) {
855
+ auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
856
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
857
+
858
+ cgh.parallel_for(
859
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
860
+ [=](sycl::nd_item<3> item_ct1)
861
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
862
+ mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
863
+ vx, vy, dst, ncols, nrows, item_ct1);
864
+ });
865
+ });
866
+ }
867
+ }
868
+
869
+ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
870
+ float *dst, const int ncols,
871
+ const int nrows,
872
+ dpct::queue_ptr stream) {
873
+ GGML_ASSERT(ncols % QK_K == 0);
874
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
875
+ const sycl::range<3> block_nums(1, 1, block_num_y);
876
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
877
+ {
878
+ stream->submit([&](sycl::handler &cgh) {
879
+ cgh.parallel_for(
880
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
881
+ [=](sycl::nd_item<3> item_ct1)
882
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
883
+ mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
884
+ vx, vy, dst, ncols, nrows, item_ct1);
885
+ });
886
+ });
887
+ }
888
+ }
889
+
890
+ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
891
+ float *dst, const int ncols,
892
+ const int nrows,
893
+ dpct::queue_ptr stream) {
894
+ GGML_ASSERT(ncols % QK4_NL == 0);
895
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
896
+ const sycl::range<3> block_nums(1, 1, block_num_y);
897
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
898
+ {
899
+
900
+ stream->submit([&](sycl::handler &cgh) {
901
+ cgh.parallel_for(
902
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
903
+ [=](sycl::nd_item<3> item_ct1)
904
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
905
+ mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
906
+ vx, vy, dst, ncols, nrows, item_ct1);
907
+ });
908
+ });
909
+ }
910
+ }
911
+
912
+ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
913
+ float *dst, const int ncols,
914
+ const int nrows,
915
+ dpct::queue_ptr stream) {
916
+ GGML_ASSERT(ncols % QK_K == 0);
917
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
918
+ const sycl::range<3> block_nums(1, 1, block_num_y);
919
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
920
+ {
921
+
922
+ stream->submit([&](sycl::handler &cgh) {
923
+ cgh.parallel_for(
924
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
925
+ [=](sycl::nd_item<3> item_ct1)
926
+ [[intel::reqd_sub_group_size(WARP_SIZE)]] {
927
+ mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
928
+ vx, vy, dst, ncols, nrows, item_ct1);
929
+ });
930
+ });
931
+ }
932
+ }
933
+
934
+ void ggml_sycl_op_mul_mat_vec_q(
935
+ ggml_backend_sycl_context & ctx,
936
+ const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
937
+ const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
938
+ float *dst_dd_i, const int64_t row_low, const int64_t row_high,
939
+ const int64_t src1_ncols, const int64_t src1_padded_col_size,
940
+ const dpct::queue_ptr &stream) {
941
+
942
+ const int64_t ne10 = src1->ne[0];
943
+ GGML_ASSERT(ne10 % QK8_1 == 0);
944
+
945
+ const int64_t ne00 = src0->ne[0];
946
+ const int64_t row_diff = row_high - row_low;
947
+
948
+ int id;
949
+ SYCL_CHECK(
950
+ CHECK_TRY_ERROR(id = get_current_device_id()));
951
+ const size_t q8_1_ts = sizeof(block_q8_1);
952
+ const size_t q8_1_bs = QK8_1;
953
+ // the main device has a larger memory buffer to hold the results from all GPUs
954
+ // nrows_dst == nrows of the matrix that the kernel writes into
955
+ const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff;
956
+ for (int i = 0; i < src1_ncols; i++)
957
+ {
958
+ const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
959
+ const char* src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
960
+ float* dst_dd_i_bs = dst_dd_i + i * dst->ne[0];
961
+ switch (src0->type) {
962
+ case GGML_TYPE_Q4_0:
963
+ mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
964
+ break;
965
+ case GGML_TYPE_Q4_1:
966
+ mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
967
+ break;
968
+ case GGML_TYPE_Q5_0:
969
+ mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
970
+ break;
971
+ case GGML_TYPE_Q5_1:
972
+ mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
973
+ break;
974
+ case GGML_TYPE_Q8_0:
975
+ mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
976
+ break;
977
+ case GGML_TYPE_Q2_K:
978
+ mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
979
+ break;
980
+ case GGML_TYPE_Q3_K:
981
+ mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
982
+ break;
983
+ case GGML_TYPE_Q4_K:
984
+ mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
985
+ break;
986
+ case GGML_TYPE_Q5_K:
987
+ mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
988
+ break;
989
+ case GGML_TYPE_Q6_K:
990
+ mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
991
+ break;
992
+ case GGML_TYPE_IQ1_S:
993
+ mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
994
+ break;
995
+ case GGML_TYPE_IQ1_M:
996
+ mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
997
+ break;
998
+ case GGML_TYPE_IQ2_XXS:
999
+ mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1000
+ break;
1001
+ case GGML_TYPE_IQ2_XS:
1002
+ mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1003
+ break;
1004
+ case GGML_TYPE_IQ2_S:
1005
+ mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1006
+ break;
1007
+ case GGML_TYPE_IQ3_XXS:
1008
+ mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1009
+ break;
1010
+ case GGML_TYPE_IQ3_S:
1011
+ mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1012
+ break;
1013
+ case GGML_TYPE_IQ4_NL:
1014
+ mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1015
+ break;
1016
+ case GGML_TYPE_IQ4_XS:
1017
+ mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1018
+ break;
1019
+ default:
1020
+ GGML_ABORT("fatal error");
1021
+ break;
1022
+ }
1023
+ }
1024
+ (void) src1;
1025
+ (void) dst;
1026
+ (void) src1_ddf_i;
1027
+ }