@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286) hide show
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -120,6 +120,7 @@ void ggml_sycl_op_im2col(
120
120
  im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
121
121
  }
122
122
 
123
- (void) src0;
124
- (void) src0_dd;
123
+ GGML_UNUSED(src0);
124
+ GGML_UNUSED(src0_dd);
125
+ GGML_UNUSED(ctx);
125
126
  }
@@ -813,7 +813,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
813
813
  x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
814
814
  }
815
815
 
816
- const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
816
+ constexpr int blocks_per_tile_x_row = QI4_K > WARP_SIZE ? 1 : WARP_SIZE / QI4_K; // == 1 if QK_K == 256
817
817
  const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
818
818
 
819
819
  #pragma unroll
@@ -961,7 +961,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
961
961
  x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
962
962
  }
963
963
 
964
- const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
964
+ constexpr int blocks_per_tile_x_row = QI5_K > WARP_SIZE ? 1 : WARP_SIZE / QI5_K; // == 1 if QK_K == 256
965
965
  const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
966
966
 
967
967
  #pragma unroll
@@ -1109,7 +1109,7 @@ load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
1109
1109
  dpct::sub_sat());
1110
1110
  }
1111
1111
 
1112
- const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
1112
+ constexpr int blocks_per_tile_x_row = QI6_K > WARP_SIZE ? 1 : WARP_SIZE / QI6_K; // == 1 if QK_K == 256
1113
1113
  const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
1114
1114
  float * x_dmf = (float *) x_dm;
1115
1115
 
@@ -3020,9 +3020,9 @@ void ggml_sycl_op_mul_mat_q(
3020
3020
  break;
3021
3021
  }
3022
3022
 
3023
- (void) src1;
3024
- (void) dst;
3025
- (void) src1_ddf_i;
3023
+ GGML_UNUSED(src1);
3024
+ GGML_UNUSED(dst);
3025
+ GGML_UNUSED(src1_ddf_i);
3026
3026
  }
3027
3027
  catch (sycl::exception const &exc) {
3028
3028
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -1,6 +1,6 @@
1
1
  #include "mmvq.hpp"
2
2
  #include "vecdotq.hpp"
3
-
3
+ #include <cassert>
4
4
 
5
5
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
6
6
  static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
@@ -13,7 +13,8 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
13
13
  }
14
14
 
15
15
  const int blocks_per_row = ncols / qk;
16
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
16
+ const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
17
+ assert(blocks_per_warp>0);
17
18
 
18
19
  // partial sum for each thread
19
20
  float tmp = 0.0f;
@@ -37,7 +38,7 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
37
38
 
38
39
  // sum up partial sums and write back result
39
40
  #pragma unroll
40
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
41
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
41
42
  tmp +=
42
43
  dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
43
44
  }
@@ -61,7 +62,8 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
61
62
  }
62
63
 
63
64
  const int blocks_per_row = ncols / qk;
64
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
65
+ const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
66
+ assert(blocks_per_warp>0);
65
67
 
66
68
  // partial sum for each thread
67
69
  float tmp = 0.0f;
@@ -85,7 +87,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
85
87
 
86
88
  // sum up partial sums and write back result
87
89
  #pragma unroll
88
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
90
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
89
91
  tmp +=
90
92
  dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
91
93
  }
@@ -109,8 +111,8 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
109
111
  }
110
112
 
111
113
  const int blocks_per_row = ncols / qk;
112
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
113
-
114
+ const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
115
+ assert(blocks_per_warp>0);
114
116
  // partial sum for each thread
115
117
  float tmp = 0.0f;
116
118
 
@@ -133,7 +135,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
133
135
 
134
136
  // sum up partial sums and write back result
135
137
  #pragma unroll
136
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
138
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
137
139
  tmp +=
138
140
  dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
139
141
  }
@@ -157,8 +159,8 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
157
159
  }
158
160
 
159
161
  const int blocks_per_row = ncols / qk;
160
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
161
-
162
+ const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
163
+ assert(blocks_per_warp>0);
162
164
  // partial sum for each thread
163
165
  float tmp = 0.0f;
164
166
 
@@ -181,7 +183,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
181
183
 
182
184
  // sum up partial sums and write back result
183
185
  #pragma unroll
184
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
186
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
185
187
  tmp +=
186
188
  dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
187
189
  }
@@ -205,8 +207,8 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
205
207
  }
206
208
 
207
209
  const int blocks_per_row = ncols / qk;
208
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
209
-
210
+ const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
211
+ assert(blocks_per_warp>0);
210
212
  // partial sum for each thread
211
213
  float tmp = 0.0f;
212
214
 
@@ -229,7 +231,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
229
231
 
230
232
  // sum up partial sums and write back result
231
233
  #pragma unroll
232
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
234
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
233
235
  tmp +=
234
236
  dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
235
237
  }
@@ -253,8 +255,8 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
253
255
  }
254
256
 
255
257
  const int blocks_per_row = ncols / qk;
256
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
257
-
258
+ const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
259
+ assert(blocks_per_warp>0);
258
260
  // partial sum for each thread
259
261
  float tmp = 0.0f;
260
262
 
@@ -277,7 +279,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
277
279
 
278
280
  // sum up partial sums and write back result
279
281
  #pragma unroll
280
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
282
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
281
283
  tmp +=
282
284
  dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
283
285
  }
@@ -301,8 +303,8 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
301
303
  }
302
304
 
303
305
  const int blocks_per_row = ncols / qk;
304
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
305
-
306
+ const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
307
+ assert(blocks_per_warp>0);
306
308
  // partial sum for each thread
307
309
  float tmp = 0.0f;
308
310
 
@@ -325,7 +327,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
325
327
 
326
328
  // sum up partial sums and write back result
327
329
  #pragma unroll
328
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
330
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
329
331
  tmp +=
330
332
  dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
331
333
  }
@@ -349,8 +351,8 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
349
351
  }
350
352
 
351
353
  const int blocks_per_row = ncols / qk;
352
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
353
-
354
+ const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
355
+ assert(blocks_per_warp>0);
354
356
  // partial sum for each thread
355
357
  float tmp = 0.0f;
356
358
 
@@ -373,7 +375,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
373
375
 
374
376
  // sum up partial sums and write back result
375
377
  #pragma unroll
376
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
378
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
377
379
  tmp +=
378
380
  dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
379
381
  }
@@ -397,8 +399,8 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
397
399
  }
398
400
 
399
401
  const int blocks_per_row = ncols / qk;
400
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
401
-
402
+ const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
403
+ assert(blocks_per_warp>0);
402
404
  // partial sum for each thread
403
405
  float tmp = 0.0f;
404
406
 
@@ -421,7 +423,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
421
423
 
422
424
  // sum up partial sums and write back result
423
425
  #pragma unroll
424
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
426
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
425
427
  tmp +=
426
428
  dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
427
429
  }
@@ -446,8 +448,8 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
446
448
  }
447
449
 
448
450
  const int blocks_per_row = ncols / qk;
449
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
450
-
451
+ const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
452
+ assert(blocks_per_warp>0);
451
453
  // partial sum for each thread
452
454
  float tmp = 0.0f;
453
455
 
@@ -470,7 +472,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
470
472
 
471
473
  // sum up partial sums and write back result
472
474
  #pragma unroll
473
- for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
475
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
474
476
  tmp +=
475
477
  dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
476
478
  }
@@ -487,7 +489,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
487
489
  GGML_ASSERT(ncols % QK4_0 == 0);
488
490
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
489
491
  const sycl::range<3> block_nums(1, 1, block_num_y);
490
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
492
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
491
493
  {
492
494
 
493
495
  stream->submit([&](sycl::handler &cgh) {
@@ -495,7 +497,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
495
497
  cgh.parallel_for(
496
498
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
497
499
  [=](sycl::nd_item<3> item_ct1)
498
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
500
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
499
501
  mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
500
502
  VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
501
503
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -511,7 +513,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
511
513
  GGML_ASSERT(ncols % QK4_1 == 0);
512
514
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
513
515
  const sycl::range<3> block_nums(1, 1, block_num_y);
514
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
516
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
515
517
  {
516
518
 
517
519
  stream->submit([&](sycl::handler &cgh) {
@@ -519,7 +521,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
519
521
  cgh.parallel_for(
520
522
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
521
523
  [=](sycl::nd_item<3> item_ct1)
522
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
524
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
523
525
  mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
524
526
  VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
525
527
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -535,7 +537,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
535
537
  GGML_ASSERT(ncols % QK5_0 == 0);
536
538
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
537
539
  const sycl::range<3> block_nums(1, 1, block_num_y);
538
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
540
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
539
541
  {
540
542
 
541
543
  stream->submit([&](sycl::handler &cgh) {
@@ -543,7 +545,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
543
545
  cgh.parallel_for(
544
546
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
545
547
  [=](sycl::nd_item<3> item_ct1)
546
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
548
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
547
549
  mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
548
550
  VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
549
551
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -559,7 +561,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
559
561
  GGML_ASSERT(ncols % QK5_1 == 0);
560
562
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
561
563
  const sycl::range<3> block_nums(1, 1, block_num_y);
562
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
564
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
563
565
  {
564
566
 
565
567
  stream->submit([&](sycl::handler &cgh) {
@@ -567,7 +569,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
567
569
  cgh.parallel_for(
568
570
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
569
571
  [=](sycl::nd_item<3> item_ct1)
570
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
572
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
571
573
  mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
572
574
  VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
573
575
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -583,7 +585,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
583
585
  GGML_ASSERT(ncols % QK8_0 == 0);
584
586
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
585
587
  const sycl::range<3> block_nums(1, 1, block_num_y);
586
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
588
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
587
589
  {
588
590
 
589
591
  stream->submit([&](sycl::handler &cgh) {
@@ -591,7 +593,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
591
593
  cgh.parallel_for(
592
594
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
593
595
  [=](sycl::nd_item<3> item_ct1)
594
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
596
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
595
597
  mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
596
598
  VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
597
599
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -607,7 +609,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
607
609
  GGML_ASSERT(ncols % QK_K == 0);
608
610
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
609
611
  const sycl::range<3> block_nums(1, 1, block_num_y);
610
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
612
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
611
613
  {
612
614
 
613
615
  stream->submit([&](sycl::handler &cgh) {
@@ -615,7 +617,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
615
617
  cgh.parallel_for(
616
618
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
617
619
  [=](sycl::nd_item<3> item_ct1)
618
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
620
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
619
621
  mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
620
622
  VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
621
623
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -631,7 +633,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
631
633
  GGML_ASSERT(ncols % QK_K == 0);
632
634
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
633
635
  const sycl::range<3> block_nums(1, 1, block_num_y);
634
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
636
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
635
637
  {
636
638
 
637
639
  stream->submit([&](sycl::handler &cgh) {
@@ -639,7 +641,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
639
641
  cgh.parallel_for(
640
642
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
641
643
  [=](sycl::nd_item<3> item_ct1)
642
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
644
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
643
645
  mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
644
646
  VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
645
647
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -655,7 +657,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
655
657
  GGML_ASSERT(ncols % QK_K == 0);
656
658
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
657
659
  const sycl::range<3> block_nums(1, 1, block_num_y);
658
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
660
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
659
661
  {
660
662
 
661
663
  stream->submit([&](sycl::handler &cgh) {
@@ -663,7 +665,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
663
665
  cgh.parallel_for(
664
666
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
665
667
  [=](sycl::nd_item<3> item_ct1)
666
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
668
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
667
669
  mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
668
670
  VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
669
671
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -679,7 +681,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
679
681
  GGML_ASSERT(ncols % QK_K == 0);
680
682
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
681
683
  const sycl::range<3> block_nums(1, 1, block_num_y);
682
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
684
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
683
685
  {
684
686
 
685
687
  stream->submit([&](sycl::handler &cgh) {
@@ -687,7 +689,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
687
689
  cgh.parallel_for(
688
690
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
689
691
  [=](sycl::nd_item<3> item_ct1)
690
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
692
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
691
693
  mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
692
694
  VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
693
695
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -703,7 +705,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
703
705
  GGML_ASSERT(ncols % QK_K == 0);
704
706
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
705
707
  const sycl::range<3> block_nums(1, 1, block_num_y);
706
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
708
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
707
709
  {
708
710
 
709
711
  stream->submit([&](sycl::handler &cgh) {
@@ -711,7 +713,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
711
713
  cgh.parallel_for(
712
714
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
713
715
  [=](sycl::nd_item<3> item_ct1)
714
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
716
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
715
717
  mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
716
718
  VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
717
719
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -728,13 +730,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
728
730
  GGML_ASSERT(ncols % QK_K == 0);
729
731
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
730
732
  const sycl::range<3> block_nums(1, 1, block_num_y);
731
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
733
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
732
734
  {
733
735
  stream->submit([&](sycl::handler &cgh) {
734
736
  cgh.parallel_for(
735
737
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
736
738
  [=](sycl::nd_item<3> item_ct1)
737
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
739
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
738
740
  mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
739
741
  vx, vy, dst, ncols, nrows, item_ct1);
740
742
  });
@@ -749,17 +751,13 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
749
751
  GGML_ASSERT(ncols % QK_K == 0);
750
752
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
751
753
  const sycl::range<3> block_nums(1, 1, block_num_y);
752
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
754
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
753
755
  {
754
-
755
- stream->submit([&](sycl::handler &cgh) {
756
- auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
757
- auto ksigns64_ptr_ct1 = &ksigns64[0];
758
-
756
+ stream->submit([&](sycl::handler & cgh) {
759
757
  cgh.parallel_for(
760
758
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
761
759
  [=](sycl::nd_item<3> item_ct1)
762
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
760
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
763
761
  mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
764
762
  vx, vy, dst, ncols, nrows, item_ct1);
765
763
  });
@@ -774,17 +772,14 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
774
772
  GGML_ASSERT(ncols % QK_K == 0);
775
773
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
776
774
  const sycl::range<3> block_nums(1, 1, block_num_y);
777
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
775
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
778
776
  {
779
777
 
780
778
  stream->submit([&](sycl::handler &cgh) {
781
- auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
782
- auto ksigns64_ptr_ct1 = &ksigns64[0];
783
-
784
779
  cgh.parallel_for(
785
780
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
786
781
  [=](sycl::nd_item<3> item_ct1)
787
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
782
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
788
783
  mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
789
784
  vx, vy, dst, ncols, nrows, item_ct1);
790
785
  });
@@ -799,17 +794,14 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
799
794
  GGML_ASSERT(ncols % QK_K == 0);
800
795
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
801
796
  const sycl::range<3> block_nums(1, 1, block_num_y);
802
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
797
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
803
798
  {
804
799
 
805
800
  stream->submit([&](sycl::handler &cgh) {
806
- auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
807
- auto ksigns64_ptr_ct1 = &ksigns64[0];
808
-
809
801
  cgh.parallel_for(
810
802
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
811
803
  [=](sycl::nd_item<3> item_ct1)
812
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
804
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
813
805
  mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
814
806
  vx, vy, dst, ncols, nrows, item_ct1);
815
807
  });
@@ -824,16 +816,14 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
824
816
  GGML_ASSERT(ncols % QK_K == 0);
825
817
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
826
818
  const sycl::range<3> block_nums(1, 1, block_num_y);
827
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
819
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
828
820
  {
829
821
 
830
822
  stream->submit([&](sycl::handler &cgh) {
831
- auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
832
-
833
823
  cgh.parallel_for(
834
824
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
835
825
  [=](sycl::nd_item<3> item_ct1)
836
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
826
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
837
827
  mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
838
828
  vx, vy, dst, ncols, nrows, item_ct1);
839
829
  });
@@ -848,17 +838,14 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
848
838
  GGML_ASSERT(ncols % QK_K == 0);
849
839
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
850
840
  const sycl::range<3> block_nums(1, 1, block_num_y);
851
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
841
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
852
842
  {
853
843
 
854
844
  stream->submit([&](sycl::handler &cgh) {
855
- auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
856
- auto ksigns64_ptr_ct1 = &ksigns64[0];
857
-
858
845
  cgh.parallel_for(
859
846
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
860
847
  [=](sycl::nd_item<3> item_ct1)
861
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
848
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
862
849
  mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
863
850
  vx, vy, dst, ncols, nrows, item_ct1);
864
851
  });
@@ -873,13 +860,13 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
873
860
  GGML_ASSERT(ncols % QK_K == 0);
874
861
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
875
862
  const sycl::range<3> block_nums(1, 1, block_num_y);
876
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
863
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
877
864
  {
878
865
  stream->submit([&](sycl::handler &cgh) {
879
866
  cgh.parallel_for(
880
867
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
881
868
  [=](sycl::nd_item<3> item_ct1)
882
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
869
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
883
870
  mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
884
871
  vx, vy, dst, ncols, nrows, item_ct1);
885
872
  });
@@ -894,14 +881,14 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
894
881
  GGML_ASSERT(ncols % QK4_NL == 0);
895
882
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
896
883
  const sycl::range<3> block_nums(1, 1, block_num_y);
897
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
884
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
898
885
  {
899
886
 
900
887
  stream->submit([&](sycl::handler &cgh) {
901
888
  cgh.parallel_for(
902
889
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
903
890
  [=](sycl::nd_item<3> item_ct1)
904
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
891
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
905
892
  mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
906
893
  vx, vy, dst, ncols, nrows, item_ct1);
907
894
  });
@@ -916,14 +903,14 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
916
903
  GGML_ASSERT(ncols % QK_K == 0);
917
904
  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
918
905
  const sycl::range<3> block_nums(1, 1, block_num_y);
919
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
906
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
920
907
  {
921
908
 
922
909
  stream->submit([&](sycl::handler &cgh) {
923
910
  cgh.parallel_for(
924
911
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
925
912
  [=](sycl::nd_item<3> item_ct1)
926
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
913
+ [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
927
914
  mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
928
915
  vx, vy, dst, ncols, nrows, item_ct1);
929
916
  });
@@ -952,7 +939,7 @@ void ggml_sycl_op_mul_mat_vec_q(
952
939
  const size_t q8_1_bs = QK8_1;
953
940
  // the main device has a larger memory buffer to hold the results from all GPUs
954
941
  // nrows_dst == nrows of the matrix that the kernel writes into
955
- const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff;
942
+
956
943
  for (int i = 0; i < src1_ncols; i++)
957
944
  {
958
945
  const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
@@ -1021,7 +1008,8 @@ void ggml_sycl_op_mul_mat_vec_q(
1021
1008
  break;
1022
1009
  }
1023
1010
  }
1024
- (void) src1;
1025
- (void) dst;
1026
- (void) src1_ddf_i;
1011
+ GGML_UNUSED(src1);
1012
+ GGML_UNUSED(dst);
1013
+ GGML_UNUSED(src1_ddf_i);
1014
+ GGML_UNUSED(ctx);
1027
1015
  }