@fugood/llama.node 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319) hide show
  1. package/CMakeLists.txt +6 -3
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +3 -3
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  24. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  25. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  26. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  27. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  28. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  29. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  31. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  32. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  36. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  37. package/src/llama.cpp/CMakeLists.txt +91 -1245
  38. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  39. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  40. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  41. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  42. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  43. package/src/llama.cpp/common/common.cpp +1116 -877
  44. package/src/llama.cpp/common/common.h +191 -77
  45. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  46. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  47. package/src/llama.cpp/common/log.h +1 -1
  48. package/src/llama.cpp/common/ngram-cache.h +10 -3
  49. package/src/llama.cpp/common/sampling.cpp +19 -10
  50. package/src/llama.cpp/docs/build.md +353 -0
  51. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  52. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  54. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  56. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  58. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  60. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  61. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  62. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  63. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  64. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  65. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  66. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  67. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  68. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  69. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  71. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  72. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  73. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  75. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  76. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  77. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  79. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  80. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  87. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  88. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  90. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  91. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  92. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  94. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  95. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  96. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  97. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  98. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  99. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  100. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  102. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  103. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  104. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  105. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  106. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  107. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  108. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  110. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  111. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  112. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  113. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  114. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  115. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  116. package/src/llama.cpp/examples/main/main.cpp +98 -75
  117. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  118. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  119. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  120. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  121. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  122. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  123. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  124. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  125. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  126. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  127. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  128. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  129. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  130. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  131. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  132. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  133. package/src/llama.cpp/examples/server/server.cpp +274 -671
  134. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  135. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  136. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  137. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  138. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  139. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  140. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  141. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  142. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  143. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  144. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  145. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  146. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  147. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  148. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  149. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  150. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  151. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  152. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  153. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  154. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  155. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  156. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  157. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  159. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  160. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  161. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  162. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  163. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  178. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  179. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  180. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  181. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  182. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  183. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  184. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  208. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  209. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  210. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  211. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  212. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  214. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  215. package/src/llama.cpp/models/.editorconfig +1 -0
  216. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  217. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  221. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  230. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  233. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  237. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  249. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  252. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  255. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  258. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  259. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  260. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  261. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  263. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  264. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  265. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  266. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  267. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  268. package/src/llama.cpp/requirements.txt +5 -4
  269. package/src/llama.cpp/scripts/build-info.sh +30 -0
  270. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  271. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  272. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  273. package/src/llama.cpp/src/llama-grammar.h +39 -0
  274. package/src/llama.cpp/src/llama-impl.h +26 -0
  275. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  276. package/src/llama.cpp/src/llama-sampling.h +56 -0
  277. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  278. package/src/llama.cpp/src/llama-vocab.h +130 -0
  279. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  280. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  281. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  282. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  283. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  284. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  285. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  286. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  287. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  289. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  290. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  291. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  292. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  293. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  294. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  295. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  296. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  297. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  298. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  299. package/bin/darwin/arm64/default.metallib +0 -0
  300. package/bin/darwin/x64/default.metallib +0 -0
  301. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  302. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  303. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  304. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  305. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  306. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  307. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  308. package/src/llama.cpp/ggml-opencl.h +0 -36
  309. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  310. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  311. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  314. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  315. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  316. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  317. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  318. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  319. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -0,0 +1,698 @@
1
+ //
2
+ // MIT license
3
+ // Copyright (C) 2024 Intel Corporation
4
+ // SPDX-License-Identifier: MIT
5
+ //
6
+
7
+ //
8
+ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
9
+ // See https://llvm.org/LICENSE.txt for license information.
10
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11
+ //
12
+
13
+ #ifndef GGML_SYCL_DEQUANTIZE_HPP
14
+ #define GGML_SYCL_DEQUANTIZE_HPP
15
+
16
+ #include "common.hpp"
17
+
18
+ typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
19
+
20
+ static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib,
21
+ const int iqs, dfloat2 &v) {
22
+ const block_q4_0 * x = (const block_q4_0 *) vx;
23
+
24
+ const dfloat d = x[ib].d;
25
+
26
+ const int vui = x[ib].qs[iqs];
27
+
28
+ v.x() = vui & 0xF;
29
+ v.y() = vui >> 4;
30
+
31
+ #ifdef GGML_SYCL_F16
32
+ // v = v - {8.0f, 8.0f};
33
+ // v = v * {d, d};
34
+ v.s0() = (v.s0() - 8.0f) * d;
35
+ v.s1() = (v.s1() - 8.0f) * d;
36
+
37
+ #else
38
+ v.x() = (v.x() - 8.0f) * d;
39
+ v.y() = (v.y() - 8.0f) * d;
40
+ #endif // GGML_SYCL_F16
41
+ }
42
+
43
+ static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
44
+ const int iqs, dfloat2 &v) {
45
+ const block_q4_1 * x = (const block_q4_1 *) vx;
46
+
47
+ const dfloat d = x[ib].dm[0];
48
+ const dfloat m = x[ib].dm[1];
49
+
50
+ const int vui = x[ib].qs[iqs];
51
+
52
+ v.x() = vui & 0xF;
53
+ v.y() = vui >> 4;
54
+
55
+ #ifdef GGML_SYCL_F16
56
+ // v = v * {d, d};
57
+ // v = v + {m, m};
58
+ v.s0() = (v.s0() * d) + m;
59
+ v.s1() = (v.s1() * d) + m;
60
+
61
+ #else
62
+ v.x() = (v.x() * d) + m;
63
+ v.y() = (v.y() * d) + m;
64
+ #endif // GGML_SYCL_F16
65
+ }
66
+
67
+ static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
68
+ const int iqs, dfloat2 &v) {
69
+ const block_q5_0 * x = (const block_q5_0 *) vx;
70
+
71
+ const dfloat d = x[ib].d;
72
+
73
+ uint32_t qh;
74
+ memcpy(&qh, x[ib].qh, sizeof(qh));
75
+
76
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
77
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
78
+
79
+ v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
80
+ v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
81
+
82
+ #ifdef GGML_SYCL_F16
83
+ // v = v - {16.0f, 16.0f};
84
+ // v = v * {d, d};
85
+ v.s0() = (v.s0() - 16.0f) * d;
86
+ v.s1() = (v.s1() - 16.0f) * d;
87
+
88
+ #else
89
+ v.x() = (v.x() - 16.0f) * d;
90
+ v.y() = (v.y() - 16.0f) * d;
91
+ #endif // GGML_SYCL_F16
92
+ }
93
+
94
+ static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
95
+ const int iqs, dfloat2 &v) {
96
+ const block_q5_1 * x = (const block_q5_1 *) vx;
97
+
98
+ const dfloat d = x[ib].dm[0];
99
+ const dfloat m = x[ib].dm[1];
100
+
101
+ uint32_t qh;
102
+ memcpy(&qh, x[ib].qh, sizeof(qh));
103
+
104
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
105
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
106
+
107
+ v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
108
+ v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
109
+
110
+ #ifdef GGML_SYCL_F16
111
+ // v = v * {d, d};
112
+ // v = v + {m, m};
113
+ v.s0() = (v.s0() * d) + m;
114
+ v.s1() = (v.s1() * d) + m;
115
+ #else
116
+ v.x() = (v.x() * d) + m;
117
+ v.y() = (v.y() * d) + m;
118
+ #endif // GGML_SYCL_F16
119
+ }
120
+
121
+ static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
122
+ const int iqs, dfloat2 &v) {
123
+ const block_q8_0 * x = (const block_q8_0 *) vx;
124
+
125
+ const dfloat d = x[ib].d;
126
+
127
+ v.x() = x[ib].qs[iqs + 0];
128
+ v.y() = x[ib].qs[iqs + 1];
129
+
130
+ #ifdef GGML_SYCL_F16
131
+ // v = v * {d, d};
132
+ v.s0() *= d;
133
+ v.s1() *= d;
134
+ #else
135
+ v.x() *= d;
136
+ v.y() *= d;
137
+ #endif // GGML_SYCL_F16
138
+ }
139
+
140
+ template<typename dst_t>
141
+ static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32,
142
+ const sycl::nd_item<3> &item_ct1) {
143
+
144
+ const int i = item_ct1.get_group(2);
145
+
146
+ // assume 32 threads
147
+ const int tid = item_ct1.get_local_id(2);
148
+ const int il = tid/8;
149
+ const int ir = tid%8;
150
+ const int ib = 8*i + ir;
151
+ if (ib >= nb32) {
152
+ return;
153
+ }
154
+
155
+ dst_t * y = yy + 256*i + 32*ir + 4*il;
156
+
157
+ const block_q4_0 * x = (const block_q4_0 *)vx + ib;
158
+ const float d = sycl::vec<sycl::half, 1>(x->d)
159
+ .convert<float, sycl::rounding_mode::automatic>()[0];
160
+ const float dm = -8*d;
161
+
162
+ const uint8_t * q = x->qs + 4*il;
163
+
164
+ for (int l = 0; l < 4; ++l) {
165
+ y[l+ 0] = d * (q[l] & 0xF) + dm;
166
+ y[l+16] = d * (q[l] >> 4) + dm;
167
+ }
168
+ }
169
+
170
+ template<typename dst_t>
171
+ static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32,
172
+ const sycl::nd_item<3> &item_ct1) {
173
+
174
+ const int i = item_ct1.get_group(2);
175
+
176
+ // assume 32 threads
177
+ const int tid = item_ct1.get_local_id(2);
178
+ const int il = tid/8;
179
+ const int ir = tid%8;
180
+ const int ib = 8*i + ir;
181
+ if (ib >= nb32) {
182
+ return;
183
+ }
184
+
185
+ dst_t * y = yy + 256*i + 32*ir + 4*il;
186
+
187
+ const block_q4_1 * x = (const block_q4_1 *)vx + ib;
188
+ const sycl::float2 d =
189
+ x->dm.convert<float, sycl::rounding_mode::automatic>();
190
+
191
+ const uint8_t * q = x->qs + 4*il;
192
+
193
+ for (int l = 0; l < 4; ++l) {
194
+ y[l + 0] = d.x() * (q[l] & 0xF) + d.y();
195
+ y[l + 16] = d.x() * (q[l] >> 4) + d.y();
196
+ }
197
+ }
198
+
199
+
200
+ //================================== k-quants
201
+
202
+ template<typename dst_t>
203
+ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
204
+ const sycl::nd_item<3> &item_ct1) {
205
+
206
+ const int i = item_ct1.get_group(2);
207
+ const block_q2_K * x = (const block_q2_K *) vx;
208
+
209
+ const int tid = item_ct1.get_local_id(2);
210
+ #if QK_K == 256
211
+ const int n = tid/32;
212
+ const int l = tid - 32*n;
213
+ const int is = 8*n + l/16;
214
+
215
+ const uint8_t q = x[i].qs[32*n + l];
216
+ dst_t * y = yy + i*QK_K + 128*n;
217
+
218
+ float dall = x[i].dm[0];
219
+ float dmin = x[i].dm[1];
220
+ y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
221
+ y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
222
+ y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
223
+ y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
224
+ #else
225
+ const int is = tid/16; // 0 or 1
226
+ const int il = tid%16; // 0...15
227
+ const uint8_t q = x[i].qs[il] >> (2*is);
228
+ dst_t * y = yy + i*QK_K + 16*is + il;
229
+
230
+ float dall = x[i].dm[0];
231
+ float dmin = x[i].dm[1];
232
+ y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
233
+ y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
234
+ #endif
235
+
236
+ }
237
+
238
+ template<typename dst_t>
239
+ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
240
+ const sycl::nd_item<3> &item_ct1) {
241
+
242
+ const int i = item_ct1.get_group(2);
243
+ const block_q3_K * x = (const block_q3_K *) vx;
244
+
245
+ #if QK_K == 256
246
+ const int r = item_ct1.get_local_id(2) / 4;
247
+ const int tid = r/2;
248
+ const int is0 = r%2;
249
+ const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
250
+ const int n = tid / 4;
251
+ const int j = tid - 4*n;
252
+
253
+ uint8_t m = 1 << (4*n + j);
254
+ int is = 8*n + 2*j + is0;
255
+ int shift = 2*j;
256
+
257
+ int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
258
+ is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
259
+ is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
260
+ (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
261
+ float d_all = x[i].d;
262
+ float dl = d_all * (us - 32);
263
+
264
+ dst_t * y = yy + i*QK_K + 128*n + 32*j;
265
+ const uint8_t * q = x[i].qs + 32*n;
266
+ const uint8_t * hm = x[i].hmask;
267
+
268
+ for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
269
+ #else
270
+ const int tid = item_ct1.get_local_id(2);
271
+ const int is = tid/16; // 0 or 1
272
+ const int il = tid%16; // 0...15
273
+ const int im = il/8; // 0...1
274
+ const int in = il%8; // 0...7
275
+
276
+ dst_t * y = yy + i*QK_K + 16*is + il;
277
+
278
+ const uint8_t q = x[i].qs[il] >> (2*is);
279
+ const uint8_t h = x[i].hmask[in] >> (2*is + im);
280
+ const float d = (float)x[i].d;
281
+
282
+ if (is == 0) {
283
+ y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
284
+ y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
285
+ } else {
286
+ y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
287
+ y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
288
+ }
289
+ #endif
290
+
291
+ }
292
+
293
+ #if QK_K == 256
294
// Unpack the j-th 6-bit (scale, min) pair from the packed byte array `q`.
//
//   j - pair index; entries 0..3 and 4.. use different layouts
//   q - packed bytes; reads q[j] and q[j+4] for j < 4, and q[j-4], q[j], q[j+4] otherwise
//   d - receives the 6-bit scale
//   m - receives the 6-bit min
//
// For j < 4 the values are the low 6 bits of q[j] and q[j+4].
// For j >= 4 the low 4 bits come from the two nibbles of q[j+4] and the upper
// 2 bits come from the top two bits of q[j-4] (scale) and q[j] (min).
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
    if (j >= 4) {
        d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
        m = (q[j + 4] >>  4) | ((q[j]     >> 6) << 4);
        return;
    }
    d = q[j]     & 63;
    m = q[j + 4] & 63;
}
303
+ #endif
304
+
305
+ template<typename dst_t>
306
+ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
307
+ uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
308
+ const block_q4_K * x = (const block_q4_K *) vx;
309
+
310
+ const int i = item_ct1.get_group(2);
311
+
312
+ #if QK_K == 256
313
+ // assume 32 threads
314
+ const int tid = item_ct1.get_local_id(2);
315
+ const int il = tid/8;
316
+ const int ir = tid%8;
317
+ const int is = 2*il;
318
+ const int n = 4;
319
+
320
+ dst_t * y = yy + i*QK_K + 64*il + n*ir;
321
+
322
+ const sycl::half2 dm = x[i].dm;
323
+ const float dall = dm[0];
324
+ const float dmin = dm[1];
325
+
326
+ if (tid < 12)
327
+ scales_local[tid] = x[i].scales[tid];
328
+ item_ct1.barrier(sycl::access::fence_space::local_space);
329
+
330
+ uint8_t sc, m;
331
+ get_scale_min_k4(is + 0, scales_local, sc, m);
332
+ const float d1 = dall * sc;
333
+ const float m1 = dmin * m;
334
+ get_scale_min_k4(is + 1, scales_local, sc, m);
335
+ const float d2 = dall * sc;
336
+ const float m2 = dmin * m;
337
+
338
+ sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(x[i].qs + 32*il + n*ir);
339
+ for (int l = 0; l < n; ++l) {
340
+ y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
341
+ y[l +32] = d2 * (q_vec[l] >> 4) - m2;
342
+ }
343
+ #else
344
+ const int tid = item_ct1.get_local_id(2);
345
+ const uint8_t * q = x[i].qs;
346
+ dst_t * y = yy + i*QK_K;
347
+ const float d = (float)x[i].dm[0];
348
+ const float m = (float)x[i].dm[1];
349
+ y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
350
+ y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
351
+ #endif
352
+ }
353
+
354
+ template<typename dst_t>
355
+ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
356
+ const sycl::nd_item<3> &item_ct1) {
357
+ const block_q5_K * x = (const block_q5_K *) vx;
358
+
359
+ const int i = item_ct1.get_group(2);
360
+
361
+ #if QK_K == 256
362
+ // assume 64 threads - this is very slightly better than the one below
363
+ const int tid = item_ct1.get_local_id(2);
364
+ const int il = tid/16; // il is in 0...3
365
+ const int ir = tid%16; // ir is in 0...15
366
+ const int is = 2*il; // is is in 0...6
367
+
368
+ dst_t * y = yy + i*QK_K + 64*il + 2*ir;
369
+
370
+ const float dall = x[i].dm[0];
371
+ const float dmin = x[i].dm[1];
372
+
373
+ const uint8_t * ql = x[i].qs + 32*il + 2*ir;
374
+ const uint8_t * qh = x[i].qh + 2*ir;
375
+
376
+ uint8_t sc, m;
377
+ get_scale_min_k4(is + 0, x[i].scales, sc, m);
378
+ const float d1 = dall * sc; const float m1 = dmin * m;
379
+ get_scale_min_k4(is + 1, x[i].scales, sc, m);
380
+ const float d2 = dall * sc; const float m2 = dmin * m;
381
+
382
+ uint8_t hm = 1 << (2*il);
383
+ y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
384
+ y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
385
+ hm <<= 1;
386
+ y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
387
+ y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
388
+ #else
389
+ const int tid = item_ct1.get_local_id(2);
390
+ const uint8_t q = x[i].qs[tid];
391
+ const int im = tid/8; // 0...3
392
+ const int in = tid%8; // 0...7
393
+ const int is = tid/16; // 0 or 1
394
+ const uint8_t h = x[i].qh[in] >> im;
395
+ const float d = x[i].d;
396
+ dst_t * y = yy + i*QK_K + tid;
397
+ y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
398
+ y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
399
+ #endif
400
+ }
401
+
402
+ template<typename dst_t>
403
+ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
404
+ const sycl::nd_item<3> &item_ct1) {
405
+ const block_q6_K * x = (const block_q6_K *) vx;
406
+
407
+ const int i = item_ct1.get_group(2);
408
+ #if QK_K == 256
409
+
410
+ // assume 64 threads - this is very slightly better than the one below
411
+ const int tid = item_ct1.get_local_id(2);
412
+ const int ip = tid/32; // ip is 0 or 1
413
+ const int il = tid - 32*ip; // 0...32
414
+ const int is = 8*ip + il/16;
415
+
416
+ dst_t * y = yy + i*QK_K + 128*ip + il;
417
+
418
+ const float d = x[i].d;
419
+
420
+ const uint8_t * ql = x[i].ql + 64*ip + il;
421
+ const uint8_t qh = x[i].qh[32*ip + il];
422
+ const int8_t * sc = x[i].scales + is;
423
+
424
+ y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
425
+ y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
426
+ y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
427
+ y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
428
+ #else
429
+
430
+ // assume 32 threads
431
+ const int tid = item_ct1.get_local_id(2);
432
+ const int ip = tid/16; // 0 or 1
433
+ const int il = tid - 16*ip; // 0...15
434
+
435
+ dst_t * y = yy + i*QK_K + 16*ip + il;
436
+
437
+ const float d = x[i].d;
438
+
439
+ const uint8_t ql = x[i].ql[16*ip + il];
440
+ const uint8_t qh = x[i].qh[il] >> (2*ip);
441
+ const int8_t * sc = x[i].scales;
442
+
443
+ y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
444
+ y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
445
+ #endif
446
+ }
447
+
448
+ template<typename dst_t>
449
+ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
450
+ const sycl::nd_item<3> &item_ct1,
451
+ const uint64_t *iq2xxs_grid_ptr,
452
+ const uint8_t *ksigns_iq2xs_ptr,
453
+ const uint8_t *kmask_iq2xs_ptr) {
454
+
455
+ const int i = item_ct1.get_group(2);
456
+ const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
457
+
458
+ const int tid = item_ct1.get_local_id(2);
459
+ #if QK_K == 256
460
+ const int il = tid/8; // 0...3
461
+ const int ib = tid%8; // 0...7
462
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
463
+ const uint16_t * q2 = x[i].qs + 4*ib;
464
+ const uint8_t * aux8 = (const uint8_t *)q2;
465
+ const uint8_t * grid = (const uint8_t *)(iq2xxs_grid_ptr + aux8[il]);
466
+ const uint32_t aux32 = q2[2] | (q2[3] << 16);
467
+ const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
468
+ const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
469
+ for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
470
+ #else
471
+ assert(false);
472
+ #endif
473
+
474
+ }
475
+
476
+ template<typename dst_t>
477
+ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy,
478
+ const sycl::nd_item<3> &item_ct1,
479
+ const uint64_t *iq2xs_grid,
480
+ const uint8_t *ksigns_iq2xs,
481
+ const uint8_t *kmask_iq2xs) {
482
+
483
+ const int i = item_ct1.get_group(2);
484
+ const block_iq2_xs * x = (const block_iq2_xs *) vx;
485
+
486
+ const int tid = item_ct1.get_local_id(2);
487
+ #if QK_K == 256
488
+ const int il = tid/8; // 0...3
489
+ const int ib = tid%8; // 0...7
490
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
491
+ const uint16_t * q2 = x[i].qs + 4*ib;
492
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
493
+ const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
494
+ const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
495
+ for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
496
+ #else
497
+ assert(false);
498
+ #endif
499
+
500
+ }
501
+
502
+ template <typename dst_t>
503
+ __dpct_inline__ static void
504
+ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
505
+ const sycl::nd_item<3> &item_ct1) {
506
+
507
+ const int i = item_ct1.get_group(2);
508
+ const block_iq2_s * x = (const block_iq2_s *) vx;
509
+
510
+ const int tid = item_ct1.get_local_id(2);
511
+ #if QK_K == 256
512
+ const int il = tid/8; // 0...3
513
+ const int ib = tid%8; // 0...7
514
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
515
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
516
+ const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
517
+ const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
518
+ #pragma unroll
519
+ for (int j = 0; j < 8; ++j)
520
+ y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
521
+ #else
522
+ assert(false);
523
+
524
+ #endif
525
+
526
+ }
527
+
528
+ template<typename dst_t>
529
+ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
530
+ const sycl::nd_item<3> &item_ct1,
531
+ const uint32_t *iq3xxs_grid,
532
+ const uint8_t *ksigns_iq2xs,
533
+ const uint8_t *kmask_iq2xs) {
534
+
535
+ const int i = item_ct1.get_group(2);
536
+ const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
537
+
538
+ const int tid = item_ct1.get_local_id(2);
539
+ #if QK_K == 256
540
+ const int il = tid/8; // 0...3
541
+ const int ib = tid%8; // 0...7
542
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
543
+ const uint8_t * q3 = x[i].qs + 8*ib;
544
+ const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
545
+ const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
546
+ const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
547
+ const uint32_t aux32 = gas[0] | (gas[1] << 16);
548
+ const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
549
+ const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
550
+ for (int j = 0; j < 4; ++j) {
551
+ y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
552
+ y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
553
+ }
554
+ #else
555
+ assert(false);
556
+ #endif
557
+
558
+ }
559
+
560
+ template <typename dst_t>
561
+ __dpct_inline__ static void
562
+ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
563
+ const sycl::nd_item<3> &item_ct1,
564
+ const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) {
565
+
566
+ const int i = item_ct1.get_group(2);
567
+ const block_iq3_s * x = (const block_iq3_s *) vx;
568
+
569
+ const int tid = item_ct1.get_local_id(2);
570
+ #if QK_K == 256
571
+ const int il = tid/8; // 0...3
572
+ const int ib = tid%8; // 0...7
573
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
574
+ const uint8_t * qs = x[i].qs + 8*ib;
575
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
576
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
577
+ const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
578
+ const uint8_t signs = x[i].signs[4*ib + il];
579
+ #pragma unroll
580
+ for (int j = 0; j < 4; ++j) {
581
+ y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
582
+ y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
583
+ }
584
+ #else
585
+ assert(false);
586
+ #endif
587
+
588
+ }
589
+
590
+ template <typename dst_t>
591
+ __dpct_inline__ static void
592
+ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
593
+ const sycl::nd_item<3> &item_ct1,
594
+ const uint32_t *iq1s_grid_gpu) {
595
+
596
+ const int i = item_ct1.get_group(2);
597
+ const block_iq1_s * x = (const block_iq1_s *) vx;
598
+
599
+ const int tid = item_ct1.get_local_id(2);
600
+ #if QK_K == 256
601
+ const int il = tid/8; // 0...3
602
+ const int ib = tid%8; // 0...7
603
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
604
+ const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
605
+ const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
606
+ uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
607
+ grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
608
+ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
609
+ grid32[0] &= 0x0f0f0f0f;
610
+ #pragma unroll
611
+ for (int j = 0; j < 8; ++j) {
612
+ y[j] = d * (q[j] + delta);
613
+ }
614
+ #else
615
+ assert(false);
616
+ #endif
617
+
618
+ }
619
+
620
+ template <typename dst_t>
621
+ __dpct_inline__ static void
622
+ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
623
+ const sycl::nd_item<3> &item_ct1,
624
+ const uint32_t *iq1s_grid_gpu) {
625
+
626
+ const int i = item_ct1.get_group(2);
627
+ const block_iq1_m * x = (const block_iq1_m *) vx;
628
+
629
+ const int tid = item_ct1.get_local_id(2);
630
+ #if QK_K == 256
631
+ const int il = tid/8; // 0...3
632
+ const int ib = tid%8; // 0...7
633
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
634
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
635
+ iq1m_scale_t scale;
636
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
637
+ const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
638
+ const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
639
+ const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
640
+ uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
641
+ grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
642
+ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
643
+ grid32[0] &= 0x0f0f0f0f;
644
+ #pragma unroll
645
+ for (int j = 0; j < 8; ++j) {
646
+ y[j] = d * (q[j] + delta);
647
+ }
648
+ #else
649
+ assert(false);
650
+ #endif
651
+
652
+ }
653
+
654
+ template <typename dst_t>
655
+ __dpct_inline__ static void
656
+ dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy,
657
+ const sycl::nd_item<3> &item_ct1) {
658
+
659
+ const int i = item_ct1.get_group(2);
660
+ const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
661
+
662
+ const int tid = item_ct1.get_local_id(2);
663
+ const int il = tid/8; // 0...3
664
+ const int ib = tid%8; // 0...7
665
+ dst_t * y = yy + i*QK_K + 32*ib + 4*il;
666
+ const uint8_t * q4 = x[ib].qs + 4*il;
667
+ const float d = (float)x[ib].d;
668
+ #pragma unroll
669
+ for (int j = 0; j < 4; ++j) {
670
+ y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
671
+ y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
672
+ }
673
+
674
+ }
675
+
676
+
677
+ template <typename dst_t>
678
+ __dpct_inline__ static void
679
+ dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
680
+ const sycl::nd_item<3> &item_ct1) {
681
+ const int i = item_ct1.get_group(2);
682
+ const block_iq4_xs * x = (const block_iq4_xs *)vx;
683
+
684
+ const int tid = item_ct1.get_local_id(2);
685
+ const int il = tid/8; // 0...3
686
+ const int ib = tid%8; // 0...7
687
+ dst_t * y = yy + i*QK_K + 32*ib + 4*il;
688
+ const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
689
+ const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
690
+ #pragma unroll
691
+ for (int j = 0; j < 4; ++j) {
692
+ y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
693
+ y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
694
+ }
695
+ }
696
+
697
+
698
+ #endif // GGML_SYCL_DEQUANTIZE_HPP