@fugood/llama.node 0.6.2 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (378)
  1. package/CMakeLists.txt +40 -30
  2. package/README.md +4 -1
  3. package/lib/binding.js +41 -29
  4. package/lib/binding.ts +26 -25
  5. package/package.json +45 -10
  6. package/scripts/build.js +47 -0
  7. package/scripts/llama.cpp.patch +109 -0
  8. package/src/anyascii.c +22223 -0
  9. package/src/anyascii.h +42 -0
  10. package/src/tts_utils.cpp +20 -7
  11. package/src/tts_utils.h +2 -0
  12. package/bin/darwin/arm64/llama-node.node +0 -0
  13. package/bin/darwin/x64/llama-node.node +0 -0
  14. package/bin/linux/arm64/llama-node.node +0 -0
  15. package/bin/linux/x64/llama-node.node +0 -0
  16. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  17. package/bin/linux-cuda/x64/llama-node.node +0 -0
  18. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  19. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  20. package/bin/win32/x64/llama-node.node +0 -0
  21. package/bin/win32/x64/node.lib +0 -0
  22. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  23. package/bin/win32-vulkan/arm64/node.lib +0 -0
  24. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  25. package/bin/win32-vulkan/x64/node.lib +0 -0
  26. package/patches/node-api-headers+1.1.0.patch +0 -26
  27. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +0 -233
  28. package/src/llama.cpp/.github/workflows/build.yml +0 -1078
  29. package/src/llama.cpp/.github/workflows/close-issue.yml +0 -28
  30. package/src/llama.cpp/.github/workflows/docker.yml +0 -178
  31. package/src/llama.cpp/.github/workflows/editorconfig.yml +0 -29
  32. package/src/llama.cpp/.github/workflows/gguf-publish.yml +0 -44
  33. package/src/llama.cpp/.github/workflows/labeler.yml +0 -17
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +0 -33
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +0 -30
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +0 -40
  37. package/src/llama.cpp/.github/workflows/release.yml +0 -739
  38. package/src/llama.cpp/.github/workflows/server.yml +0 -237
  39. package/src/llama.cpp/.github/workflows/winget.yml +0 -42
  40. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +0 -16
  41. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +0 -16
  42. package/src/llama.cpp/cmake/build-info.cmake +0 -64
  43. package/src/llama.cpp/cmake/common.cmake +0 -35
  44. package/src/llama.cpp/cmake/git-vars.cmake +0 -22
  45. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -5
  46. package/src/llama.cpp/common/build-info.cpp.in +0 -4
  47. package/src/llama.cpp/docs/build.md +0 -561
  48. package/src/llama.cpp/examples/CMakeLists.txt +0 -43
  49. package/src/llama.cpp/examples/batched/CMakeLists.txt +0 -5
  50. package/src/llama.cpp/examples/batched/batched.cpp +0 -246
  51. package/src/llama.cpp/examples/chat-13B.bat +0 -57
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +0 -5
  53. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +0 -941
  54. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +0 -35
  55. package/src/llama.cpp/examples/embedding/CMakeLists.txt +0 -5
  56. package/src/llama.cpp/examples/embedding/embedding.cpp +0 -323
  57. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +0 -10
  58. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +0 -194
  59. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +0 -5
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +0 -83
  61. package/src/llama.cpp/examples/gguf/CMakeLists.txt +0 -5
  62. package/src/llama.cpp/examples/gguf/gguf.cpp +0 -265
  63. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +0 -22
  64. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +0 -46
  65. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +0 -295
  66. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +0 -52
  67. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +0 -221
  68. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +0 -24
  69. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +0 -42
  70. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +0 -7093
  71. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +0 -694
  72. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +0 -5
  73. package/src/llama.cpp/examples/gritlm/gritlm.cpp +0 -229
  74. package/src/llama.cpp/examples/jeopardy/questions.txt +0 -100
  75. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +0 -65
  76. package/src/llama.cpp/examples/llama.android/build.gradle.kts +0 -6
  77. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +0 -71
  78. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +0 -53
  79. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +0 -452
  80. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +0 -18
  81. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +0 -5
  82. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -472
  83. package/src/llama.cpp/examples/lookup/CMakeLists.txt +0 -23
  84. package/src/llama.cpp/examples/lookup/lookup-create.cpp +0 -40
  85. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +0 -47
  86. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -157
  87. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -242
  88. package/src/llama.cpp/examples/parallel/CMakeLists.txt +0 -5
  89. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -492
  90. package/src/llama.cpp/examples/passkey/CMakeLists.txt +0 -5
  91. package/src/llama.cpp/examples/passkey/passkey.cpp +0 -277
  92. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +0 -5
  93. package/src/llama.cpp/examples/retrieval/retrieval.cpp +0 -304
  94. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -5
  95. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -246
  96. package/src/llama.cpp/examples/simple/CMakeLists.txt +0 -5
  97. package/src/llama.cpp/examples/simple/simple.cpp +0 -206
  98. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +0 -5
  99. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +0 -206
  100. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +0 -11
  101. package/src/llama.cpp/examples/speculative/CMakeLists.txt +0 -5
  102. package/src/llama.cpp/examples/speculative/speculative.cpp +0 -644
  103. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +0 -5
  104. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +0 -261
  105. package/src/llama.cpp/examples/sycl/CMakeLists.txt +0 -9
  106. package/src/llama.cpp/examples/sycl/build.sh +0 -23
  107. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +0 -13
  108. package/src/llama.cpp/examples/sycl/run-llama2.sh +0 -27
  109. package/src/llama.cpp/examples/sycl/run-llama3.sh +0 -28
  110. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +0 -33
  111. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +0 -9
  112. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +0 -9
  113. package/src/llama.cpp/examples/training/CMakeLists.txt +0 -5
  114. package/src/llama.cpp/examples/training/finetune.cpp +0 -96
  115. package/src/llama.cpp/ggml/cmake/GitVars.cmake +0 -22
  116. package/src/llama.cpp/ggml/cmake/common.cmake +0 -26
  117. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1042
  118. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -255
  119. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -586
  120. package/src/llama.cpp/ggml/src/ggml-backend.cpp +0 -2008
  121. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +0 -87
  122. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +0 -517
  123. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -74
  124. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +0 -179
  125. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +0 -258
  126. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +0 -2863
  127. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +0 -1110
  128. package/src/llama.cpp/ggml/src/ggml-cann/common.h +0 -420
  129. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -2570
  130. package/src/llama.cpp/ggml/src/ggml-common.h +0 -1857
  131. package/src/llama.cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +0 -100
  132. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +0 -184
  133. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +0 -15
  134. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +0 -243
  135. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +0 -140
  136. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -131
  137. package/src/llama.cpp/ggml/src/ggml-impl.h +0 -601
  138. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  139. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  140. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +0 -120
  141. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +0 -622
  142. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -113
  143. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +0 -96
  144. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -5124
  145. package/src/llama.cpp/ggml/src/ggml-opt.cpp +0 -1037
  146. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -5232
  147. package/src/llama.cpp/ggml/src/ggml-quants.h +0 -100
  148. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +0 -9
  149. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +0 -1813
  150. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +0 -189
  151. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +0 -37
  152. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +0 -239
  153. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +0 -39
  154. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -83
  155. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +0 -493
  156. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +0 -197
  157. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +0 -20
  158. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +0 -100
  159. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +0 -20
  160. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +0 -623
  161. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +0 -34
  162. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -701
  163. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +0 -11
  164. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +0 -791
  165. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +0 -1160
  166. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +0 -27
  167. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +0 -2957
  168. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -1536
  169. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +0 -75
  170. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +0 -99
  171. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +0 -311
  172. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +0 -20
  173. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -4443
  174. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +0 -105
  175. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +0 -8
  176. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +0 -136
  177. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +0 -21
  178. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -3030
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +0 -33
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +0 -1108
  181. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +0 -27
  182. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +0 -474
  183. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +0 -26
  184. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +0 -46
  185. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +0 -10
  186. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +0 -74
  187. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +0 -83
  188. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +0 -362
  189. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +0 -20
  190. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +0 -264
  191. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +0 -20
  192. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +0 -13
  193. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +0 -23
  194. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +0 -73
  195. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +0 -20
  196. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +0 -1215
  197. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +0 -305
  198. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +0 -10
  199. package/src/llama.cpp/ggml/src/ggml-threading.cpp +0 -12
  200. package/src/llama.cpp/ggml/src/ggml-threading.h +0 -14
  201. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +0 -196
  202. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +0 -10699
  203. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -39
  204. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +0 -751
  205. package/src/llama.cpp/ggml/src/ggml.c +0 -6550
  206. package/src/llama.cpp/ggml/src/gguf.cpp +0 -1330
  207. package/src/llama.cpp/models/.editorconfig +0 -1
  208. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  209. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  210. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  211. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  212. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  213. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  214. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  215. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  216. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  217. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  218. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  220. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  221. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  222. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  223. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  226. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  227. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  228. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  229. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  230. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  231. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  232. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  233. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  234. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  236. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  237. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  238. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  240. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  241. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  242. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  249. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  250. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  253. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  256. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  257. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  258. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
  259. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  260. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  261. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  262. package/src/llama.cpp/pocs/CMakeLists.txt +0 -14
  263. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +0 -9
  264. package/src/llama.cpp/pocs/vdot/q8dot.cpp +0 -173
  265. package/src/llama.cpp/pocs/vdot/vdot.cpp +0 -311
  266. package/src/llama.cpp/prompts/LLM-questions.txt +0 -49
  267. package/src/llama.cpp/prompts/alpaca.txt +0 -1
  268. package/src/llama.cpp/prompts/assistant.txt +0 -31
  269. package/src/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  270. package/src/llama.cpp/prompts/chat-with-bob.txt +0 -7
  271. package/src/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  272. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  273. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  274. package/src/llama.cpp/prompts/chat.txt +0 -28
  275. package/src/llama.cpp/prompts/dan-modified.txt +0 -1
  276. package/src/llama.cpp/prompts/dan.txt +0 -1
  277. package/src/llama.cpp/prompts/mnemonics.txt +0 -93
  278. package/src/llama.cpp/prompts/parallel-questions.txt +0 -43
  279. package/src/llama.cpp/prompts/reason-act.txt +0 -18
  280. package/src/llama.cpp/requirements/requirements-all.txt +0 -15
  281. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +0 -2
  282. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +0 -7
  283. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -7
  284. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +0 -5
  285. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +0 -1
  286. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +0 -4
  287. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +0 -3
  288. package/src/llama.cpp/requirements/requirements-pydantic.txt +0 -3
  289. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +0 -1
  290. package/src/llama.cpp/requirements/requirements-tool_bench.txt +0 -12
  291. package/src/llama.cpp/requirements.txt +0 -13
  292. package/src/llama.cpp/scripts/build-info.sh +0 -30
  293. package/src/llama.cpp/scripts/install-oneapi.bat +0 -19
  294. package/src/llama.cpp/scripts/xxd.cmake +0 -16
  295. package/src/llama.cpp/tests/CMakeLists.txt +0 -177
  296. package/src/llama.cpp/tests/get-model.cpp +0 -21
  297. package/src/llama.cpp/tests/get-model.h +0 -2
  298. package/src/llama.cpp/tests/test-arg-parser.cpp +0 -178
  299. package/src/llama.cpp/tests/test-autorelease.cpp +0 -24
  300. package/src/llama.cpp/tests/test-backend-ops.cpp +0 -4793
  301. package/src/llama.cpp/tests/test-barrier.cpp +0 -94
  302. package/src/llama.cpp/tests/test-c.c +0 -7
  303. package/src/llama.cpp/tests/test-chat-template.cpp +0 -417
  304. package/src/llama.cpp/tests/test-chat.cpp +0 -985
  305. package/src/llama.cpp/tests/test-double-float.cpp +0 -57
  306. package/src/llama.cpp/tests/test-gbnf-validator.cpp +0 -109
  307. package/src/llama.cpp/tests/test-gguf.cpp +0 -1338
  308. package/src/llama.cpp/tests/test-grammar-integration.cpp +0 -1308
  309. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +0 -1201
  310. package/src/llama.cpp/tests/test-grammar-parser.cpp +0 -519
  311. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +0 -1304
  312. package/src/llama.cpp/tests/test-llama-grammar.cpp +0 -408
  313. package/src/llama.cpp/tests/test-log.cpp +0 -39
  314. package/src/llama.cpp/tests/test-model-load-cancel.cpp +0 -27
  315. package/src/llama.cpp/tests/test-mtmd-c-api.c +0 -63
  316. package/src/llama.cpp/tests/test-opt.cpp +0 -904
  317. package/src/llama.cpp/tests/test-quantize-fns.cpp +0 -186
  318. package/src/llama.cpp/tests/test-quantize-perf.cpp +0 -365
  319. package/src/llama.cpp/tests/test-quantize-stats.cpp +0 -424
  320. package/src/llama.cpp/tests/test-regex-partial.cpp +0 -288
  321. package/src/llama.cpp/tests/test-rope.cpp +0 -262
  322. package/src/llama.cpp/tests/test-sampling.cpp +0 -399
  323. package/src/llama.cpp/tests/test-tokenizer-0.cpp +0 -312
  324. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +0 -155
  325. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +0 -125
  326. package/src/llama.cpp/tools/CMakeLists.txt +0 -39
  327. package/src/llama.cpp/tools/batched-bench/CMakeLists.txt +0 -5
  328. package/src/llama.cpp/tools/batched-bench/batched-bench.cpp +0 -204
  329. package/src/llama.cpp/tools/cvector-generator/CMakeLists.txt +0 -5
  330. package/src/llama.cpp/tools/cvector-generator/completions.txt +0 -582
  331. package/src/llama.cpp/tools/cvector-generator/cvector-generator.cpp +0 -508
  332. package/src/llama.cpp/tools/cvector-generator/mean.hpp +0 -48
  333. package/src/llama.cpp/tools/cvector-generator/negative.txt +0 -4
  334. package/src/llama.cpp/tools/cvector-generator/pca.hpp +0 -315
  335. package/src/llama.cpp/tools/cvector-generator/positive.txt +0 -4
  336. package/src/llama.cpp/tools/export-lora/CMakeLists.txt +0 -5
  337. package/src/llama.cpp/tools/export-lora/export-lora.cpp +0 -434
  338. package/src/llama.cpp/tools/gguf-split/CMakeLists.txt +0 -5
  339. package/src/llama.cpp/tools/gguf-split/gguf-split.cpp +0 -583
  340. package/src/llama.cpp/tools/imatrix/CMakeLists.txt +0 -5
  341. package/src/llama.cpp/tools/imatrix/imatrix.cpp +0 -667
  342. package/src/llama.cpp/tools/llama-bench/CMakeLists.txt +0 -5
  343. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +0 -2024
  344. package/src/llama.cpp/tools/main/CMakeLists.txt +0 -5
  345. package/src/llama.cpp/tools/main/main.cpp +0 -977
  346. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +0 -58
  347. package/src/llama.cpp/tools/mtmd/clip-impl.h +0 -462
  348. package/src/llama.cpp/tools/mtmd/clip.cpp +0 -4024
  349. package/src/llama.cpp/tools/mtmd/clip.h +0 -101
  350. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +0 -22
  351. package/src/llama.cpp/tools/mtmd/miniaudio.h +0 -93468
  352. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +0 -855
  353. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +0 -62
  354. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +0 -377
  355. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +0 -297
  356. package/src/llama.cpp/tools/mtmd/mtmd.cpp +0 -942
  357. package/src/llama.cpp/tools/mtmd/mtmd.h +0 -362
  358. package/src/llama.cpp/tools/mtmd/requirements.txt +0 -5
  359. package/src/llama.cpp/tools/perplexity/CMakeLists.txt +0 -5
  360. package/src/llama.cpp/tools/perplexity/perplexity.cpp +0 -2063
  361. package/src/llama.cpp/tools/quantize/CMakeLists.txt +0 -6
  362. package/src/llama.cpp/tools/quantize/quantize.cpp +0 -519
  363. package/src/llama.cpp/tools/rpc/CMakeLists.txt +0 -4
  364. package/src/llama.cpp/tools/rpc/rpc-server.cpp +0 -322
  365. package/src/llama.cpp/tools/run/CMakeLists.txt +0 -16
  366. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.cpp +0 -1995
  367. package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.h +0 -137
  368. package/src/llama.cpp/tools/run/run.cpp +0 -1261
  369. package/src/llama.cpp/tools/server/CMakeLists.txt +0 -51
  370. package/src/llama.cpp/tools/server/bench/requirements.txt +0 -2
  371. package/src/llama.cpp/tools/server/httplib.h +0 -10506
  372. package/src/llama.cpp/tools/server/server.cpp +0 -4966
  373. package/src/llama.cpp/tools/server/tests/requirements.txt +0 -8
  374. package/src/llama.cpp/tools/server/utils.hpp +0 -1337
  375. package/src/llama.cpp/tools/tokenize/CMakeLists.txt +0 -5
  376. package/src/llama.cpp/tools/tokenize/tokenize.cpp +0 -416
  377. package/src/llama.cpp/tools/tts/CMakeLists.txt +0 -5
  378. package/src/llama.cpp/tools/tts/tts.cpp +0 -1092
@@ -1,1160 +0,0 @@
1
- #include "convert.hpp"
2
- #include "dmmv.hpp"
3
- #include "dequantize.hpp"
4
- #include "presets.hpp"
5
-
6
- static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
7
- const sycl::half *x = (const sycl::half *)vx;
8
-
9
- // automatic half -> float type cast if dfloat == float
10
- v.x() = x[ib + iqs + 0];
11
- v.y() = x[ib + iqs + 1];
12
- }
13
-
14
- static void convert_f32(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
15
- const float * x = (const float *) vx;
16
-
17
- // automatic half -> float type cast if dfloat == float
18
- v.x() = x[ib + iqs + 0];
19
- v.y() = x[ib + iqs + 1];
20
- }
21
-
22
- template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
23
- static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
24
- const sycl::nd_item<3> &item_ct1) {
25
- // qk = quantized weights per x block
26
- // qr = number of quantized weights per data value in x block
27
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
28
- item_ct1.get_local_id(1);
29
-
30
- if (row >= nrows) {
31
- return;
32
- }
33
-
34
- const int tid = item_ct1.get_local_id(2);
35
-
36
- const int iter_stride = 2*GGML_SYCL_DMMV_X;
37
- const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
38
- const int y_offset = qr == 1 ? 1 : qk/2;
39
-
40
- // partial sum for each thread
41
- #ifdef GGML_SYCL_F16
42
- sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
43
- #else
44
- float tmp = 0.0f;
45
- #endif // GGML_SYCL_F16
46
-
47
- for (int i = 0; i < ncols; i += iter_stride) {
48
- const int col = i + vals_per_iter*tid;
49
- const int ib = (row*ncols + col)/qk; // x block index
50
- const int iqs = (col%qk)/qr; // x quant index
51
- const int iybs = col - col%qk; // y block start index
52
-
53
- // processing >2 values per i iter is faster for fast GPUs
54
- #pragma unroll
55
- for (int j = 0; j < vals_per_iter; j += 2) {
56
- // process 2 vals per j iter
57
-
58
- // dequantize
59
- // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
60
- dfloat2 v;
61
- dequantize_kernel(vx, ib, iqs + j/qr, v);
62
-
63
- // matrix multiplication
64
- // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
65
- #ifdef GGML_SYCL_F16
66
- dfloat2 t1{y[iybs + iqs + j / qr + 0],
67
- y[iybs + iqs + j / qr + y_offset]};
68
-
69
- tmp += v * t1;
70
- #else
71
- tmp += v.x() * y[iybs + iqs + j / qr + 0];
72
- tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
73
- #endif // GGML_SYCL_F16
74
- }
75
- }
76
-
77
- // sum up partial sums and write back result
78
- const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2;
79
- for (int mask = mask_start; mask > 0; mask >>= 1) {
80
- tmp +=
81
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
82
- }
83
-
84
- if (tid == 0) {
85
- #ifdef GGML_SYCL_F16
86
- dst[row] = tmp.x() + tmp.y();
87
- #else
88
- dst[row] = tmp;
89
- #endif // GGML_SYCL_F16
90
- }
91
- }
92
-
93
- template <int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_reorder>
94
- static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
95
- const sycl::nd_item<3> &item_ct1) {
96
- // qk = quantized weights per x block
97
- // qr = number of quantized weights per data value in x block
98
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
99
- item_ct1.get_local_id(1);
100
-
101
- if (row >= nrows) {
102
- return;
103
- }
104
-
105
- const int tid = item_ct1.get_local_id(2);
106
-
107
-
108
- const int ncols_left = ncols % (QK4_0*WARP_SIZE);
109
- const int ncols_align = ncols - ncols_left;
110
- const int iter_stride = 8*2*GGML_SYCL_DMMV_X;
111
- const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter //64/16=4, 512/16/2= 16
112
- const int y_offset = qr == 1 ? 1 : qk/2;
113
-
114
- // partial sum for each thread
115
- #ifdef GGML_SYCL_F16
116
- sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
117
- #else
118
- float tmp = 0.0f;
119
- #endif // GGML_SYCL_F16
120
- const char *d_ptr = (const char*)vx+ncols*nrows/2;
121
- int i=0;
122
- for (i = 0; i < ncols_align; i += iter_stride) {
123
- const int col = i + vals_per_iter*tid;
124
- const int ib = (row*ncols + col)/qk; // x block index
125
- const int iqs = (col%qk)/qr; // x quant index
126
- const int iybs = col - col%qk; // y block start index
127
-
128
- // processing >2 values per i iter is faster for fast GPUs
129
- #pragma unroll
130
- for (int j = 0; j < vals_per_iter; j += 2) {
131
- // process 2 vals per j iter
132
-
133
- // dequantize
134
- // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
135
- dfloat2 v;
136
- dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
137
-
138
- // matrix multiplication
139
- // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
140
- #ifdef GGML_SYCL_F16
141
- dfloat2 t1{y[iybs + iqs + j / qr + 0],
142
- y[iybs + iqs + j / qr + y_offset]};
143
-
144
- tmp += v * t1;
145
- #else
146
- tmp += v.x() * y[iybs + iqs + j / qr + 0];
147
- tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
148
- #endif // GGML_SYCL_F16
149
- }
150
- }
151
-
152
- for (; i < ncols; i += iter_stride) {
153
- if (tid>=ncols_left/QK4_0) continue;
154
- const int col = i + vals_per_iter*tid;
155
- const int ib = (row*ncols + col)/qk; // x block index
156
- const int iqs = (col%qk)/qr; // x quant index
157
- const int iybs = col - col%qk; // y block start index
158
-
159
- // processing >2 values per i iter is faster for fast GPUs
160
- #pragma unroll
161
- for (int j = 0; j < vals_per_iter; j += 2) {
162
- // process 2 vals per j iter
163
-
164
- // dequantize
165
- // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
166
- dfloat2 v;
167
- dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v);
168
-
169
- // matrix multiplication
170
- // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
171
- #ifdef GGML_SYCL_F16
172
- dfloat2 t1{y[iybs + iqs + j / qr + 0],
173
- y[iybs + iqs + j / qr + y_offset]};
174
-
175
- tmp += v * t1;
176
- #else
177
- tmp += v.x() * y[iybs + iqs + j / qr + 0];
178
- tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
179
- #endif // GGML_SYCL_F16
180
- }
181
- }
182
-
183
- // sum up partial sums and write back result
184
- const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2;
185
- for (int mask = mask_start; mask > 0; mask >>= 1) {
186
- tmp +=
187
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
188
- }
189
-
190
- if (tid == 0) {
191
- #ifdef GGML_SYCL_F16
192
- dst[row] = tmp.x() + tmp.y();
193
- #else
194
- dst[row] = tmp;
195
- #endif // GGML_SYCL_F16
196
- }
197
- }
198
-
199
- static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
200
- float *dst, const int ncols,
201
- const int nrows,
202
- dpct::queue_ptr stream) {
203
- GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
204
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
205
- const sycl::range<3> block_nums(1, 1, block_num_y);
206
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
207
- {
208
- dpct::has_capability_or_fail(stream->get_device(),
209
- {sycl::aspect::fp16});
210
-
211
- stream->parallel_for(
212
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
213
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
214
- dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
215
- nrows, item_ct1);
216
- });
217
- }
218
- }
219
-
220
- /*
221
- DPCT1110:4: The total declared local variable size in device function
222
- dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
223
- pressure. Consult with your hardware vendor to find the total register size
224
- available and adjust the code, or use smaller sub-group size to avoid high
225
- register pressure.
226
- */
227
- static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
228
- const float *__restrict__ yy,
229
- float *__restrict__ dst,
230
- const int ncols, int nrows,
231
- const sycl::nd_item<3> &item_ct1) {
232
-
233
- static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
234
-
235
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
236
- item_ct1.get_local_id(1);
237
- if (row > nrows) return;
238
-
239
- const int num_blocks_per_row = ncols / QK_K;
240
- const int ib0 = row*num_blocks_per_row;
241
-
242
- const block_q2_K * x = (const block_q2_K *)vx + ib0;
243
-
244
- float tmp = 0; // partial sum for thread in warp
245
-
246
- #if QK_K == 256
247
- const int tid =
248
- item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
249
- const int ix =
250
- item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
251
-
252
- const int step = 16/K_QUANTS_PER_ITERATION;
253
-
254
- const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
255
- const int in = tid - step*im; // 0...15 or 0...7
256
-
257
- const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
258
- const int q_offset = 32*im + l0;
259
- const int s_offset = 8*im;
260
- const int y_offset = 128*im + l0;
261
-
262
- uint32_t aux[4];
263
- const uint8_t * d = (const uint8_t *)aux;
264
- const uint8_t * m = (const uint8_t *)(aux + 2);
265
-
266
- for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
267
-
268
- const float * y = yy + i * QK_K + y_offset;
269
- const uint8_t * q = x[i].qs + q_offset;
270
-
271
- const float dall = x[i].dm[0];
272
- const float dmin = x[i].dm[1];
273
-
274
- const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
275
- aux[0] = a[0] & 0x0f0f0f0f;
276
- aux[1] = a[1] & 0x0f0f0f0f;
277
- aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
278
- aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
279
-
280
- float sum1 = 0, sum2 = 0;
281
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
282
- sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
283
- + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
284
- + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
285
- + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
286
- + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
287
- + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
288
- + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
289
- +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
290
- sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
291
- + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
292
-
293
- }
294
- tmp += dall * sum1 - dmin * sum2;
295
-
296
- }
297
- #else
298
- const int tid = item_ct1.get_local_id(2) /
299
- (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
300
- const int ix = item_ct1.get_local_id(2) %
301
- (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
302
- const int offset = tid * K_QUANTS_PER_ITERATION;
303
-
304
- uint32_t uaux[2];
305
- const uint8_t * d = (const uint8_t *)uaux;
306
-
307
-
308
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
309
-
310
- const float * y = yy + i * QK_K + offset;
311
- const uint8_t * q = x[i].qs + offset;
312
- const uint32_t * s = (const uint32_t *)x[i].scales;
313
-
314
- uaux[0] = s[0] & 0x0f0f0f0f;
315
- uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
316
-
317
- const sycl::float2 dall =
318
- x[i].dm.convert<float, sycl::rounding_mode::automatic>();
319
-
320
- float sum1 = 0, sum2 = 0;
321
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
322
- const uint8_t ql = q[l];
323
- sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
324
- + y[l+16] * d[1] * ((ql >> 2) & 3)
325
- + y[l+32] * d[2] * ((ql >> 4) & 3)
326
- + y[l+48] * d[3] * ((ql >> 6) & 3);
327
- sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
328
- }
329
- tmp += dall.x() * sum1 - dall.y() * sum2;
330
- }
331
-
332
- #endif
333
-
334
- // sum up partial sums and write back result
335
- #pragma unroll
336
- for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
337
- tmp +=
338
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
339
- }
340
-
341
- if (item_ct1.get_local_id(2) == 0) {
342
- dst[row] = tmp;
343
- }
344
- }
345
-
346
- /*
347
- DPCT1110:5: The total declared local variable size in device function
348
- dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register
349
- pressure. Consult with your hardware vendor to find the total register size
350
- available and adjust the code, or use smaller sub-group size to avoid high
351
- register pressure.
352
- */
353
- static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
354
- const float *__restrict__ yy,
355
- float *__restrict__ dst,
356
- const int ncols, int nrows,
357
- const sycl::nd_item<3> &item_ct1) {
358
-
359
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
360
- item_ct1.get_local_id(1);
361
- if (row > nrows) return;
362
-
363
- const int num_blocks_per_row = ncols / QK_K;
364
- const int ib0 = row*num_blocks_per_row;
365
-
366
- const block_q3_K * x = (const block_q3_K *)vx + ib0;
367
-
368
- float tmp = 0; // partial sum for thread in warp
369
-
370
- #if QK_K == 256
371
-
372
- const uint16_t kmask1 = 0x0303;
373
- const uint16_t kmask2 = 0x0f0f;
374
-
375
- const int tid =
376
- item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
377
- const int ix =
378
- item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
379
-
380
- const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
381
- const int step = 16/K_QUANTS_PER_ITERATION;
382
- const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
383
- const int in = tid - step*im; // 0....15 or 0...7
384
-
385
- const uint8_t m = 1 << (4*im);
386
-
387
- const int l0 = n*in; // 0...15 or 0...14 in steps of 2
388
- const int q_offset = 32*im + l0;
389
- const int y_offset = 128*im + l0;
390
-
391
- uint16_t utmp[4];
392
- const int8_t * s = (const int8_t *)utmp;
393
-
394
- const uint16_t s_shift = 4*im;
395
-
396
- for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
397
-
398
- const float * y = yy + i * QK_K + y_offset;
399
- const uint8_t * q = x[i].qs + q_offset;
400
- const uint8_t * h = x[i].hmask + l0;
401
-
402
- const uint16_t * a = (const uint16_t *)x[i].scales;
403
- utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
404
- utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
405
- utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
406
- utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
407
-
408
- const float d = x[i].d;
409
-
410
- float sum = 0;
411
- for (int l = 0; l < n; ++l) {
412
- sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
413
- + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
414
- + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
415
- + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
416
- sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
417
- + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
418
- + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
419
- + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
420
- }
421
- tmp += d * sum;
422
-
423
- }
424
- #else
425
-
426
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
427
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
428
- const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
429
- const int in = offset/8; // 0 or 1
430
- const int im = offset%8; // 0...7
431
-
432
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
433
-
434
- const float * y = yy + i * QK_K + offset;
435
- const uint8_t * q = x[i].qs + offset;
436
- const uint8_t * s = x[i].scales;
437
-
438
- const float dall = (float)x[i].d;
439
-
440
- float sum = 0;
441
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
442
- const uint8_t hl = x[i].hmask[im+l] >> in;
443
- const uint8_t ql = q[l];
444
- sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
445
- + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
446
- + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
447
- + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
448
- }
449
- tmp += sum;
450
- }
451
- #endif
452
-
453
- // sum up partial sums and write back result
454
- #pragma unroll
455
- for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
456
- tmp +=
457
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
458
- }
459
-
460
- if (item_ct1.get_local_id(2) == 0) {
461
- dst[row] = tmp;
462
- }
463
- }
464
-
465
- /*
466
- DPCT1110:6: The total declared local variable size in device function
467
- dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register
468
- pressure. Consult with your hardware vendor to find the total register size
469
- available and adjust the code, or use smaller sub-group size to avoid high
470
- register pressure.
471
- */
472
- static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
473
- const float *__restrict__ yy,
474
- float *__restrict__ dst,
475
- const int ncols, int nrows,
476
- const sycl::nd_item<3> &item_ct1) {
477
-
478
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
479
- item_ct1.get_local_id(1);
480
- if (row > nrows) return;
481
- const int num_blocks_per_row = ncols / QK_K;
482
- const int ib0 = row*num_blocks_per_row;
483
-
484
- const block_q4_K * x = (const block_q4_K *)vx + ib0;
485
-
486
- #if QK_K == 256
487
- const uint16_t kmask1 = 0x3f3f;
488
- const uint16_t kmask2 = 0x0f0f;
489
- const uint16_t kmask3 = 0xc0c0;
490
-
491
- const int tid =
492
- item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
493
- const int ix =
494
- item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
495
-
496
- const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
497
-
498
- const int il = tid/step; // 0...3
499
- const int ir = tid - step*il; // 0...7 or 0...3
500
- const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
501
-
502
- const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
503
- const int in = il%2;
504
-
505
- const int l0 = n*(2*ir + in);
506
- const int q_offset = 32*im + l0;
507
- const int y_offset = 64*im + l0;
508
-
509
- uint16_t aux[4];
510
- const uint8_t * sc = (const uint8_t *)aux;
511
-
512
- #if K_QUANTS_PER_ITERATION == 2
513
- uint32_t q32[4];
514
- const uint8_t * q4 = (const uint8_t *)q32;
515
- #else
516
- uint16_t q16[4];
517
- const uint8_t * q4 = (const uint8_t *)q16;
518
- #endif
519
-
520
- float tmp = 0; // partial sum for thread in warp
521
-
522
- for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
523
-
524
- const float * y1 = yy + i*QK_K + y_offset;
525
- const float * y2 = y1 + 128;
526
-
527
- const float dall = x[i].dm[0];
528
- const float dmin = x[i].dm[1];
529
-
530
- const uint16_t * a = (const uint16_t *)x[i].scales;
531
- aux[0] = a[im+0] & kmask1;
532
- aux[1] = a[im+2] & kmask1;
533
- aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
534
- aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
535
-
536
- #if K_QUANTS_PER_ITERATION == 2
537
- const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
538
- const uint32_t * q2 = q1 + 16;
539
-
540
- q32[0] = q1[0] & 0x0f0f0f0f;
541
- q32[1] = q1[0] & 0xf0f0f0f0;
542
- q32[2] = q2[0] & 0x0f0f0f0f;
543
- q32[3] = q2[0] & 0xf0f0f0f0;
544
-
545
- sycl::float4 s = {0.f, 0.f, 0.f, 0.f};
546
- float smin = 0;
547
- for (int l = 0; l < 4; ++l) {
548
- s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4];
549
- s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12];
550
- smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
551
- }
552
- tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f +
553
- s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) -
554
- dmin * smin;
555
- #else
556
- const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
557
- const uint16_t * q2 = q1 + 32;
558
-
559
- q16[0] = q1[0] & 0x0f0f;
560
- q16[1] = q1[0] & 0xf0f0;
561
- q16[2] = q2[0] & 0x0f0f;
562
- q16[3] = q2[0] & 0xf0f0;
563
-
564
- float4 s = {0.f, 0.f, 0.f, 0.f};
565
- float smin = 0;
566
- for (int l = 0; l < 2; ++l) {
567
- s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
568
- s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
569
- smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
570
- }
571
- tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
572
- #endif
573
-
574
- }
575
- #else
576
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
577
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
578
-
579
- const int step = tid * K_QUANTS_PER_ITERATION;
580
-
581
- uint16_t aux16[2];
582
- const uint8_t * s = (const uint8_t *)aux16;
583
-
584
- float tmp = 0;
585
-
586
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
587
- const uint8_t * q = x[i].qs + step;
588
- const float * y = yy + i*QK_K + step;
589
- const uint16_t * a = (const uint16_t *)x[i].scales;
590
- aux16[0] = a[0] & 0x0f0f;
591
- aux16[1] = (a[0] >> 4) & 0x0f0f;
592
- const float d = (float)x[i].dm[0];
593
- const float m = (float)x[i].dm[1];
594
- float sum = 0.f;
595
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
596
- sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
597
- + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
598
- + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
599
- + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
600
- }
601
- tmp += sum;
602
- }
603
-
604
- #endif
605
-
606
- // sum up partial sums and write back result
607
- #pragma unroll
608
- for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
609
- tmp +=
610
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
611
- }
612
-
613
- if (tid == 0) {
614
- dst[row] = tmp;
615
- }
616
- }
617
-
618
- /*
619
- DPCT1110:7: The total declared local variable size in device function
620
- dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register
621
- pressure. Consult with your hardware vendor to find the total register size
622
- available and adjust the code, or use smaller sub-group size to avoid high
623
- register pressure.
624
- */
625
- static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
626
- const float *__restrict__ yy,
627
- float *__restrict__ dst,
628
- const int ncols,
629
- const sycl::nd_item<3> &item_ct1) {
630
-
631
- const int row = item_ct1.get_group(2);
632
- const int num_blocks_per_row = ncols / QK_K;
633
- const int ib0 = row*num_blocks_per_row;
634
-
635
- const block_q5_K * x = (const block_q5_K *)vx + ib0;
636
-
637
- float tmp = 0; // partial sum for thread in warp
638
-
639
- #if QK_K == 256
640
- const uint16_t kmask1 = 0x3f3f;
641
- const uint16_t kmask2 = 0x0f0f;
642
- const uint16_t kmask3 = 0xc0c0;
643
-
644
- const int tid = item_ct1.get_local_id(2) / 2; // 0...15
645
- const int ix = item_ct1.get_local_id(2) % 2;
646
-
647
- const int il = tid/4; // 0...3
648
- const int ir = tid - 4*il;// 0...3
649
- const int n = 2;
650
-
651
- const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
652
- const int in = il%2;
653
-
654
- const int l0 = n*(2*ir + in);
655
- const int q_offset = 32*im + l0;
656
- const int y_offset = 64*im + l0;
657
-
658
- const uint8_t hm1 = 1 << (2*im);
659
- const uint8_t hm2 = hm1 << 4;
660
-
661
- uint16_t aux[4];
662
- const uint8_t * sc = (const uint8_t *)aux;
663
-
664
- uint16_t q16[8];
665
- const uint8_t * q4 = (const uint8_t *)q16;
666
-
667
- for (int i = ix; i < num_blocks_per_row; i += 2) {
668
-
669
- const uint8_t * ql1 = x[i].qs + q_offset;
670
- const uint8_t * qh = x[i].qh + l0;
671
- const float * y1 = yy + i*QK_K + y_offset;
672
- const float * y2 = y1 + 128;
673
-
674
- const float dall = x[i].dm[0];
675
- const float dmin = x[i].dm[1];
676
-
677
- const uint16_t * a = (const uint16_t *)x[i].scales;
678
- aux[0] = a[im+0] & kmask1;
679
- aux[1] = a[im+2] & kmask1;
680
- aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
681
- aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
682
-
683
- sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
684
- float smin = 0;
685
- const uint16_t * q1 = (const uint16_t *)ql1;
686
- const uint16_t * q2 = q1 + 32;
687
- q16[0] = q1[0] & 0x0f0f;
688
- q16[1] = q1[8] & 0x0f0f;
689
- q16[2] = (q1[0] >> 4) & 0x0f0f;
690
- q16[3] = (q1[8] >> 4) & 0x0f0f;
691
- q16[4] = q2[0] & 0x0f0f;
692
- q16[5] = q2[8] & 0x0f0f;
693
- q16[6] = (q2[0] >> 4) & 0x0f0f;
694
- q16[7] = (q2[8] >> 4) & 0x0f0f;
695
- for (int l = 0; l < n; ++l) {
696
- sum.x() +=
697
- y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
698
- y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
699
- sum.y() +=
700
- y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
701
- y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
702
- sum.z() +=
703
- y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
704
- y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
705
- sum.w() +=
706
- y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
707
- y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
708
- smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
709
- + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
710
- }
711
- tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
712
- sum.w() * sc[5]) -
713
- dmin * smin;
714
- }
715
-
716
- #else
717
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
718
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
719
- const int step = tid * K_QUANTS_PER_ITERATION;
720
- const int im = step/8;
721
- const int in = step%8;
722
-
723
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
724
- const uint8_t * q = x[i].qs + step;
725
- const int8_t * s = x[i].scales;
726
- const float * y = yy + i*QK_K + step;
727
- const float d = x[i].d;
728
- float sum = 0.f;
729
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
730
- const uint8_t h = x[i].qh[in+j] >> im;
731
- sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
732
- + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
733
- + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
734
- + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
735
- }
736
- tmp += sum;
737
- }
738
- #endif
739
-
740
- // sum up partial sums and write back result
741
- #pragma unroll
742
- for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
743
- tmp +=
744
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
745
- }
746
-
747
- if (item_ct1.get_local_id(2) == 0) {
748
- dst[row] = tmp;
749
- }
750
- }
751
-
752
- static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
753
- const sycl::nd_item<3> &item_ct1) {
754
-
755
- static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
756
-
757
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
758
- item_ct1.get_local_id(1);
759
- if (row > nrows) return;
760
-
761
- const int num_blocks_per_row = ncols / QK_K;
762
- const int ib0 = row*num_blocks_per_row;
763
-
764
- const block_q6_K * x = (const block_q6_K *)vx + ib0;
765
-
766
- #if QK_K == 256
767
-
768
- const int tid =
769
- item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
770
- const int ix =
771
- item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1
772
-
773
- const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
774
-
775
- const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
776
- const int in = tid - step*im; // 0...15 or 0...7
777
-
778
- #if K_QUANTS_PER_ITERATION == 1
779
- const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
780
- const int is = 0;
781
- #else
782
- const int l0 = 4 * in; // 0, 4, 8, ..., 28
783
- const int is = in / 4;
784
- #endif
785
- const int ql_offset = 64*im + l0;
786
- const int qh_offset = 32*im + l0;
787
- const int s_offset = 8*im + is;
788
- const int y_offset = 128*im + l0;
789
-
790
- float tmp = 0; // partial sum for thread in warp
791
-
792
- for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
793
-
794
- const float * y = yy + i * QK_K + y_offset;
795
- const uint8_t * ql = x[i].ql + ql_offset;
796
- const uint8_t * qh = x[i].qh + qh_offset;
797
- const int8_t * s = x[i].scales + s_offset;
798
-
799
- const float d = x[i].d;
800
-
801
- #if K_QUANTS_PER_ITERATION == 1
802
- float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
803
- + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
804
- + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
805
- + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
806
- + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
807
- + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
808
- + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
809
- +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
810
- tmp += sum;
811
- #else
812
- float sum = 0;
813
- for (int l = 0; l < 4; ++l) {
814
- sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
815
- + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
816
- + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
817
- + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
818
- }
819
- tmp += sum;
820
- #endif
821
-
822
- }
823
-
824
- #else
825
-
826
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
827
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
828
-
829
- const int step = tid * K_QUANTS_PER_ITERATION;
830
-
831
- float tmp = 0; // partial sum for thread in warp
832
-
833
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
834
-
835
- const float * y = yy + i * QK_K + step;
836
- const uint8_t * ql = x[i].ql + step;
837
- const uint8_t * qh = x[i].qh + step;
838
- const int8_t * s = x[i].scales;
839
-
840
- const float d = x[i+0].d;
841
-
842
- float sum = 0;
843
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
844
- sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
845
- + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
846
- + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
847
- + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
848
- }
849
- tmp += sum;
850
-
851
- }
852
-
853
- #endif
854
-
855
- // sum up partial sums and write back result
856
- #pragma unroll
857
- for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
858
- tmp +=
859
- dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
860
- }
861
-
862
- if (tid == 0) {
863
- dst[row] = tmp;
864
- }
865
- }
866
-
867
- static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloat *y,
868
- float *dst, const int ncols,
869
- const int nrows,
870
- dpct::queue_ptr stream) {
871
- GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
872
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
873
- // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
874
- const sycl::range<3> block_nums(1, 1, block_num_y);
875
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
876
- {
877
- dpct::has_capability_or_fail(stream->get_device(),
878
- {sycl::aspect::fp16});
879
-
880
- stream->parallel_for(
881
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
882
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
883
- dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(
884
- vx, y, dst, ncols, nrows, item_ct1);
885
- });
886
- }
887
- }
888
-
889
-
890
- static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
891
- float *dst, const int ncols,
892
- const int nrows,
893
- dpct::queue_ptr stream) {
894
- GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
895
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
896
- // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
897
- const sycl::range<3> block_nums(1, 1, block_num_y);
898
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
899
- {
900
- dpct::has_capability_or_fail(stream->get_device(),
901
- {sycl::aspect::fp16});
902
-
903
- stream->parallel_for(
904
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
905
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
906
- dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
907
- vx, y, dst, ncols, nrows, item_ct1);
908
- });
909
- }
910
- }
911
-
912
- static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
913
- float *dst, const int ncols,
914
- const int nrows,
915
- dpct::queue_ptr stream) {
916
- GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
917
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
918
- const sycl::range<3> block_nums(1, 1, block_num_y);
919
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
920
- {
921
- dpct::has_capability_or_fail(stream->get_device(),
922
- {sycl::aspect::fp16});
923
-
924
- stream->parallel_for(
925
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
926
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
927
- dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
928
- vx, y, dst, ncols, nrows, item_ct1);
929
- });
930
- }
931
- }
932
-
933
- static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,
934
- float *dst, const int ncols,
935
- const int nrows,
936
- dpct::queue_ptr stream) {
937
- GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
938
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
939
- const sycl::range<3> block_nums(1, 1, block_num_y);
940
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
941
- {
942
- dpct::has_capability_or_fail(stream->get_device(),
943
- {sycl::aspect::fp16});
944
-
945
- stream->parallel_for(
946
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
947
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
948
- dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
949
- vx, y, dst, ncols, nrows, item_ct1);
950
- });
951
- }
952
- }
953
-
954
- static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
955
- float *dst, const int ncols,
956
- const int nrows,
957
- dpct::queue_ptr stream) {
958
- GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
959
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
960
- const sycl::range<3> block_nums(1, 1, block_num_y);
961
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
962
- {
963
- dpct::has_capability_or_fail(stream->get_device(),
964
- {sycl::aspect::fp16});
965
-
966
- stream->parallel_for(
967
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
968
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
969
- dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
970
- vx, y, dst, ncols, nrows, item_ct1);
971
- });
972
- }
973
- }
974
-
975
- static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
976
- float *dst, const int ncols,
977
- const int nrows,
978
- dpct::queue_ptr stream) {
979
- GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
980
- const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
981
- const sycl::range<3> block_nums(1, 1, block_num_y);
982
- const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
983
- {
984
- dpct::has_capability_or_fail(stream->get_device(),
985
- {sycl::aspect::fp16});
986
-
987
- stream->parallel_for(
988
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
989
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
990
- dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
991
- vx, y, dst, ncols, nrows, item_ct1);
992
- });
993
- }
994
- }
995
-
996
- static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
997
- float *dst, const int ncols,
998
- const int nrows,
999
- dpct::queue_ptr stream) {
1000
- GGML_ASSERT(ncols % QK_K == 0);
1001
- const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
1002
- const int block_num_y = (nrows + ny - 1) / ny;
1003
- const sycl::range<3> block_nums(1, 1, block_num_y);
1004
- const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1005
- stream->parallel_for(
1006
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1007
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1008
- dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
1009
- });
1010
- }
1011
-
1012
- static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
1013
- float *dst, const int ncols,
1014
- const int nrows,
1015
- dpct::queue_ptr stream) {
1016
- GGML_ASSERT(ncols % QK_K == 0);
1017
- const int ny = 2 / K_QUANTS_PER_ITERATION;
1018
- const int block_num_y = (nrows + ny - 1) / ny;
1019
- const sycl::range<3> block_nums(1, 1, block_num_y);
1020
- const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1021
- stream->parallel_for(
1022
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1023
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1024
- dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
1025
- });
1026
- }
1027
-
1028
- static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
1029
- float *dst, const int ncols,
1030
- const int nrows,
1031
- dpct::queue_ptr stream) {
1032
- GGML_ASSERT(ncols % QK_K == 0);
1033
- const int ny = 2 / K_QUANTS_PER_ITERATION;
1034
- const int block_num_y = (nrows + ny - 1) / ny;
1035
- const sycl::range<3> block_nums(1, 1, block_num_y);
1036
- const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1037
- stream->parallel_for(
1038
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1039
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1040
- dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
1041
- });
1042
- }
1043
-
1044
- static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
1045
- float *dst, const int ncols,
1046
- const int nrows,
1047
- dpct::queue_ptr stream) {
1048
- GGML_ASSERT(ncols % QK_K == 0);
1049
- const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
1050
- stream->parallel_for(
1051
- sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
1052
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1053
- dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
1054
- });
1055
- }
1056
-
1057
- static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
1058
- float *dst, const int ncols,
1059
- const int nrows,
1060
- dpct::queue_ptr stream) {
1061
- GGML_ASSERT(ncols % QK_K == 0);
1062
- const int ny = 2 / K_QUANTS_PER_ITERATION;
1063
- const int block_num_y = (nrows + ny - 1) / ny;
1064
- const sycl::range<3> block_nums(1, 1, block_num_y);
1065
- const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1066
- stream->parallel_for(
1067
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
1068
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1069
- dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
1070
- });
1071
- }
1072
-
1073
- void ggml_sycl_op_dequantize_mul_mat_vec(
1074
- ggml_backend_sycl_context & ctx,
1075
- const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
1076
- const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
1077
- float *dst_dd_i, const int64_t row_low, const int64_t row_high,
1078
- const int64_t src1_ncols, const int64_t src1_padded_row_size,
1079
- const dpct::queue_ptr &stream) {
1080
-
1081
- const int64_t ne00 = src0->ne[0];
1082
- const int64_t row_diff = row_high - row_low;
1083
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
1084
- // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
1085
- #ifdef GGML_SYCL_F16
1086
- ggml_sycl_pool_alloc<sycl::half> src1_dfloat_a(ctx.pool());
1087
- sycl::half *src1_dfloat = nullptr; // dfloat == half
1088
-
1089
- bool src1_convert_f16 =
1090
- src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
1091
- src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
1092
- src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
1093
-
1094
- if (src1_convert_f16) {
1095
- src1_dfloat = src1_dfloat_a.alloc(ne00);
1096
- const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
1097
- GGML_ASSERT(to_fp16_sycl != nullptr);
1098
- to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream);
1099
- }
1100
- #else
1101
- const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
1102
- #endif // GGML_SYCL_F16
1103
-
1104
- switch (src0->type) {
1105
- case GGML_TYPE_Q4_0:
1106
- if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
1107
- ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
1108
- dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1109
- } else {
1110
- dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1111
- }
1112
- break;
1113
- case GGML_TYPE_Q4_1:
1114
- dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1115
- break;
1116
- case GGML_TYPE_Q5_0:
1117
- dequantize_mul_mat_vec_q5_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1118
- break;
1119
- case GGML_TYPE_Q5_1:
1120
- dequantize_mul_mat_vec_q5_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1121
- break;
1122
- case GGML_TYPE_Q8_0:
1123
- dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1124
- break;
1125
- case GGML_TYPE_Q2_K:
1126
- dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1127
- break;
1128
- case GGML_TYPE_Q3_K:
1129
- dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1130
- break;
1131
- case GGML_TYPE_Q4_K:
1132
- if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
1133
- ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
1134
- // reorder is currently not supported for dmmv
1135
- GGML_ABORT("Unimplemented dequantize case case for q4_k reorder");
1136
- } else {
1137
- dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1138
- }
1139
- break;
1140
- case GGML_TYPE_Q5_K:
1141
- dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1142
- break;
1143
- case GGML_TYPE_Q6_K:
1144
- dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1145
- break;
1146
- case GGML_TYPE_F16:
1147
- convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1148
- break;
1149
- default:
1150
- printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
1151
- GGML_ABORT("fatal error");
1152
- }
1153
-
1154
- GGML_UNUSED(src1);
1155
- GGML_UNUSED(dst);
1156
- GGML_UNUSED(src1_ddq_i);
1157
- GGML_UNUSED(src1_ncols);
1158
- GGML_UNUSED(src1_padded_row_size);
1159
- GGML_UNUSED(ctx);
1160
- }