@fugood/llama.node 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. package/CMakeLists.txt +6 -3
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +3 -3
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  24. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  25. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  26. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  27. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  28. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  29. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  31. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  32. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  36. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  37. package/src/llama.cpp/CMakeLists.txt +91 -1245
  38. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  39. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  40. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  41. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  42. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  43. package/src/llama.cpp/common/common.cpp +1116 -877
  44. package/src/llama.cpp/common/common.h +191 -77
  45. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  46. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  47. package/src/llama.cpp/common/log.h +1 -1
  48. package/src/llama.cpp/common/ngram-cache.h +10 -3
  49. package/src/llama.cpp/common/sampling.cpp +19 -10
  50. package/src/llama.cpp/docs/build.md +353 -0
  51. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  52. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  54. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  56. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  58. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  60. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  61. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  62. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  63. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  64. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  65. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  66. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  67. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  68. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  69. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  71. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  72. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  73. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  75. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  76. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  77. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  79. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  80. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  87. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  88. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  90. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  91. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  92. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  94. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  95. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  96. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  97. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  98. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  99. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  100. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  102. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  103. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  104. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  105. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  106. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  107. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  108. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  110. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  111. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  112. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  113. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  114. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  115. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  116. package/src/llama.cpp/examples/main/main.cpp +98 -75
  117. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  118. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  119. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  120. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  121. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  122. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  123. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  124. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  125. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  126. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  127. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  128. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  129. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  130. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  131. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  132. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  133. package/src/llama.cpp/examples/server/server.cpp +274 -671
  134. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  135. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  136. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  137. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  138. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  139. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  140. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  141. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  142. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  143. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  144. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  145. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  146. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  147. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  148. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  149. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  150. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  151. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  152. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  153. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  154. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  155. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  156. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  157. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  159. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  160. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  161. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  162. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  163. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  178. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  179. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  180. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  181. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  182. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  183. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  184. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  208. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  209. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  210. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  211. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  212. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  214. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  215. package/src/llama.cpp/models/.editorconfig +1 -0
  216. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  217. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  221. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  230. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  233. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  237. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  249. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  252. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  255. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  258. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  259. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  260. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  261. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  263. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  264. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  265. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  266. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  267. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  268. package/src/llama.cpp/requirements.txt +5 -4
  269. package/src/llama.cpp/scripts/build-info.sh +30 -0
  270. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  271. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  272. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  273. package/src/llama.cpp/src/llama-grammar.h +39 -0
  274. package/src/llama.cpp/src/llama-impl.h +26 -0
  275. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  276. package/src/llama.cpp/src/llama-sampling.h +56 -0
  277. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  278. package/src/llama.cpp/src/llama-vocab.h +130 -0
  279. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  280. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  281. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  282. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  283. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  284. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  285. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  286. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  287. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  289. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  290. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  291. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  292. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  293. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  294. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  295. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  296. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  297. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  298. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  299. package/bin/darwin/arm64/default.metallib +0 -0
  300. package/bin/darwin/x64/default.metallib +0 -0
  301. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  302. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  303. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  304. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  305. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  306. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  307. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  308. package/src/llama.cpp/ggml-opencl.h +0 -36
  309. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  310. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  311. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  314. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  315. package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  316. package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  317. package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  318. package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  319. package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp (new file)
@@ -0,0 +1,27 @@
+ //
+ // MIT license
+ // Copyright (C) 2024 Intel Corporation
+ // SPDX-License-Identifier: MIT
+ //
+
+ //
+ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ // See https://llvm.org/LICENSE.txt for license information.
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ //
+
+ #ifndef GGML_SYCL_MMVQ_HPP
+ #define GGML_SYCL_MMVQ_HPP
+
+ #include "common.hpp"
+
+
+ void ggml_sycl_op_mul_mat_vec_q(
+     ggml_backend_sycl_context & ctx,
+     const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+     const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+     float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+     const int64_t src1_ncols, const int64_t src1_padded_row_size,
+     const dpct::queue_ptr &stream);
+
+ #endif // GGML_SYCL_MMVQ_HPP
package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp (new file)
@@ -0,0 +1,374 @@
+ #include "norm.hpp"
+
+ static void norm_f32(const float* x, float* dst, const int ncols, const float eps,
+     const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
+     const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+         item_ct1.get_local_id(1);
+     const int tid = item_ct1.get_local_id(2);
+
+     const int nthreads = item_ct1.get_local_range(2);
+     const int nwarps = nthreads / WARP_SIZE;
+     assert(nwarps % WARP_SIZE == 0);
+     sycl::float2 mean_var = sycl::float2(0.f, 0.f);
+
+     for (int col = tid; col < ncols; col += block_size) {
+         const float xi = x[row * ncols + col];
+         mean_var.x() += xi;
+         mean_var.y() += xi * xi;
+     }
+
+     // sum up partial sums
+     mean_var = warp_reduce_sum(mean_var, item_ct1);
+     if (block_size > WARP_SIZE) {
+
+         int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+         int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+         if (lane_id == 0) {
+             s_sum[warp_id] = mean_var;
+         }
+         /*
+         DPCT1118:0: SYCL group functions and algorithms must be encountered in
+         converged control flow. You may need to adjust the code.
+         */
+         item_ct1.barrier(sycl::access::fence_space::local_space);
+         mean_var = 0.f;
+         int nreduce = nwarps / WARP_SIZE;
+         for (size_t i = 0; i < nreduce; i += 1)
+         {
+             mean_var += s_sum[lane_id + i * WARP_SIZE];
+         }
+         mean_var = warp_reduce_sum(mean_var, item_ct1);
+     }
+
+     const float mean = mean_var.x() / ncols;
+     const float var = mean_var.y() / ncols - mean * mean;
+     const float inv_std = sycl::rsqrt(var + eps);
+
+     for (int col = tid; col < ncols; col += block_size) {
+         dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std;
+     }
+ }
+
+ static void group_norm_f32(const float* x, float* dst, const int group_size, const int ne_elements, const float eps,
+     const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
+     int start = item_ct1.get_group(2) * group_size;
+     int end = start + group_size;
+     const int nthreads = item_ct1.get_local_range(2);
+     const int nwarps = nthreads / WARP_SIZE;
+     assert(nwarps % WARP_SIZE == 0);
+     start += item_ct1.get_local_id(2);
+     int nreduce = nwarps / WARP_SIZE;
+
+     if (end >= ne_elements) {
+         end = ne_elements;
+     }
+
+     float tmp = 0.0f; // partial sum for thread in warp
+
+     for (int j = start; j < end; j += block_size) {
+         tmp += x[j];
+     }
+
+     tmp = warp_reduce_sum(tmp, item_ct1);
+     if (block_size > WARP_SIZE) {
+
+         int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+         int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+         if (lane_id == 0) {
+             s_sum[warp_id] = tmp;
+         }
+         /*
+         DPCT1118:1: SYCL group functions and algorithms must be encountered in
+         converged control flow. You may need to adjust the code.
+         */
+         /*
+         DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
+         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+         better performance if there is no access to global memory.
+         */
+         item_ct1.barrier();
+         tmp = 0.f;
+         for (size_t i = 0; i < nreduce; i += 1)
+         {
+             tmp += s_sum[lane_id + i * WARP_SIZE];
+         }
+         tmp = warp_reduce_sum(tmp, item_ct1);
+     }
+
+     float mean = tmp / group_size;
+     tmp = 0.0f;
+
+     for (int j = start; j < end; j += block_size) {
+         float xi = x[j] - mean;
+         dst[j] = xi;
+         tmp += xi * xi;
+     }
+
+     tmp = warp_reduce_sum(tmp, item_ct1);
+     if (block_size > WARP_SIZE) {
+
+         int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+         int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+         if (lane_id == 0) {
+             s_sum[warp_id] = tmp;
+         }
+         /*
+         DPCT1118:2: SYCL group functions and algorithms must be encountered in
+         converged control flow. You may need to adjust the code.
+         */
+         /*
+         DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
+         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+         better performance if there is no access to global memory.
+         */
+         item_ct1.barrier();
+         tmp = 0.f;
+         for (size_t i = 0; i < nreduce; i += 1)
+         {
+             tmp += s_sum[lane_id + i * WARP_SIZE];
+         }
+         tmp = warp_reduce_sum(tmp, item_ct1);
+     }
+
+     float variance = tmp / group_size;
+     float scale = sycl::rsqrt(variance + eps);
+     for (int j = start; j < end; j += block_size) {
+         dst[j] *= scale;
+     }
+ }
+
+ static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps,
+     const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
+     const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+         item_ct1.get_local_id(1);
+     const int tid = item_ct1.get_local_id(2);
+     const int nthreads = item_ct1.get_local_range(2);
+     const int nwarps = nthreads / WARP_SIZE;
+     assert(nwarps % WARP_SIZE == 0);
+     float tmp = 0.0f; // partial sum for thread in warp
+
+     for (int col = tid; col < ncols; col += block_size) {
+         const float xi = x[row * ncols + col];
+         tmp += xi * xi;
+     }
+
+     // sum up partial sums
+     tmp = warp_reduce_sum(tmp, item_ct1);
+     if (block_size > WARP_SIZE) {
+
+         int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+         int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+         if (lane_id == 0) {
+             s_sum[warp_id] = tmp;
+         }
+         /*
+         DPCT1118:3: SYCL group functions and algorithms must be encountered in
+         converged control flow. You may need to adjust the code.
+         */
+         item_ct1.barrier(sycl::access::fence_space::local_space);
+         int nreduce = nwarps / WARP_SIZE;
+         tmp = 0.f;
+         for (size_t i = 0; i < nreduce; i += 1)
+         {
+             tmp += s_sum[lane_id + i * WARP_SIZE];
+         }
+         tmp = warp_reduce_sum(tmp, item_ct1);
+     }
+
+     const float mean = tmp / ncols;
+     const float scale = sycl::rsqrt(mean + eps);
+
+     for (int col = tid; col < ncols; col += block_size) {
+         dst[row * ncols + col] = scale * x[row * ncols + col];
+     }
+ }
+
+ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
+     const int nrows, const float eps,
+     queue_ptr stream, int device) {
+     GGML_ASSERT(ncols % WARP_SIZE == 0);
+     if (ncols < 1024) {
+         const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+         stream->submit([&](sycl::handler& cgh) {
+             cgh.parallel_for(
+                 sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                     block_dims),
+                 [=](sycl::nd_item<3> item_ct1)
+                     [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                         norm_f32(x, dst, ncols, eps, item_ct1,
+                             nullptr, WARP_SIZE);
+                     });
+         });
+     }
+     else {
+         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+         const sycl::range<3> block_dims(1, 1, work_group_size);
+         /*
+         DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
+         the limit. To get the device limit, query
+         info::device::max_work_group_size. Adjust the work-group size if needed.
+         */
+         stream->submit([&](sycl::handler& cgh) {
+             sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
+                 sycl::range<1>(work_group_size / WARP_SIZE), cgh);
+
+             cgh.parallel_for(
+                 sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                     block_dims),
+                 [=](sycl::nd_item<3> item_ct1)
+                     [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                         norm_f32(x, dst, ncols, eps, item_ct1,
+                             get_pointer(s_sum_acc_ct1), work_group_size);
+                     });
+         });
+     }
+ }
+
+ static void group_norm_f32_sycl(const float* x, float* dst,
+     const int num_groups, const int group_size,
+     const int ne_elements, queue_ptr stream, int device) {
+     static const float eps = 1e-6f;
+     if (group_size < 1024) {
+         const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+         stream->submit([&](sycl::handler& cgh) {
+             const float eps_ct4 = eps;
+             cgh.parallel_for(
+                 sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                     block_dims),
+                 [=](sycl::nd_item<3> item_ct1)
+                     [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                         group_norm_f32(
+                             x, dst, group_size, ne_elements, eps_ct4, item_ct1,
+                             nullptr, WARP_SIZE);
+                     });
+         });
+     }
+     else {
+         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+         const sycl::range<3> block_dims(1, 1, work_group_size);
+         /*
+         DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
+         the limit. To get the device limit, query
+         info::device::max_work_group_size. Adjust the work-group size if needed.
+         */
+
+         stream->submit([&](sycl::handler& cgh) {
+             sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
+                 cgh);
+
+             const float eps_ct4 = eps;
+
+             cgh.parallel_for(
+                 sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                     block_dims),
+                 [=](sycl::nd_item<3> item_ct1)
+                     [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                         group_norm_f32(x, dst, group_size, ne_elements,
+                             eps_ct4, item_ct1,
+                             get_pointer(s_sum_acc_ct1), work_group_size);
+                     });
+         });
+     }
+ }
+
+ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
+     const int nrows, const float eps,
+     queue_ptr stream, int device) {
+     GGML_ASSERT(ncols % WARP_SIZE == 0);
+     // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
+     if (ncols < 1024) {
+         const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+         stream->submit([&](sycl::handler& cgh) {
+             cgh.parallel_for(
+                 sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                     block_dims),
+                 [=](sycl::nd_item<3> item_ct1)
+                     [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                         rms_norm_f32(x, dst, ncols, eps, item_ct1,
+                             nullptr, WARP_SIZE);
+                     });
+         });
+     }
+     else {
+         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
+         const sycl::range<3> block_dims(1, 1, work_group_size);
+         /*
+         DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
+         the limit. To get the device limit, query
+         info::device::max_work_group_size. Adjust the work-group size if needed.
+         */
+         stream->submit([&](sycl::handler& cgh) {
+             sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
+                 cgh);
+             cgh.parallel_for(
+                 sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                     block_dims),
+                 [=](sycl::nd_item<3> item_ct1)
+                     [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                         rms_norm_f32(x, dst, ncols, eps, item_ct1,
+                             get_pointer(s_sum_acc_ct1), work_group_size);
+                     });
+         });
+     }
+ }
+
+ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
+     ggml_tensor* dst, const float* src0_dd,
+     const float* src1_dd, float* dst_dd,
+     const queue_ptr& main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+     const int64_t ne00 = src0->ne[0];
+     const int64_t nrows = ggml_nrows(src0);
+
+     float eps;
+     memcpy(&eps, dst->op_params, sizeof(float));
+
+     norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
+
+     (void)src1;
+     (void)dst;
+     (void)src1_dd;
+ }
+
+ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+     const ggml_tensor* src1, ggml_tensor* dst,
+     const float* src0_dd, const float* src1_dd,
+     float* dst_dd,
+     const queue_ptr& main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+     int num_groups = dst->op_params[0];
+     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+     group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
+
+     (void)src1;
+     (void)dst;
+     (void)src1_dd;
+ }
+
+ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+     const ggml_tensor* src1, ggml_tensor* dst,
+     const float* src0_dd, const float* src1_dd,
+     float* dst_dd,
+     const queue_ptr& main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+     const int64_t ne00 = src0->ne[0];
+     const int64_t nrows = ggml_nrows(src0);
+
+     float eps;
+     memcpy(&eps, dst->op_params, sizeof(float));
+
+     rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
+
+     (void)src1;
+     (void)dst;
+     (void)src1_dd;
+ }
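
Aside: the three kernels in norm.cpp above compute the standard ggml normalizations. norm_f32 produces (x - mean) / sqrt(var + eps) per row, with the variance computed as E[x^2] - E[x]^2; rms_norm_f32 skips the mean subtraction and only rescales by 1 / sqrt(mean(x^2) + eps); group_norm_f32 applies the same mean/variance normalization over each channel group. Note that group_norm_f32_sycl hard-codes eps = 1e-6f, while the norm and RMS-norm paths read eps from dst->op_params. A minimal host-side C++ sketch of the same math (the ref_* helpers are hypothetical, not part of this package) can serve as a CPU reference when validating the SYCL output:

#include <cmath>
#include <cstddef>
#include <vector>

// Reference for norm_f32: out = (x - mean) / sqrt(var + eps), over one row.
static std::vector<float> ref_norm(const std::vector<float>& x, float eps) {
    float sum = 0.f, sumsq = 0.f;
    for (float v : x) { sum += v; sumsq += v * v; }
    const float mean = sum / x.size();
    const float var  = sumsq / x.size() - mean * mean; // E[x^2] - E[x]^2, as in the kernel
    const float inv_std = 1.0f / std::sqrt(var + eps);
    std::vector<float> out(x.size());
    for (size_t i = 0; i < x.size(); ++i) out[i] = (x[i] - mean) * inv_std;
    return out;
}

// Reference for rms_norm_f32: out = x / sqrt(mean(x^2) + eps), no mean subtraction.
static std::vector<float> ref_rms_norm(const std::vector<float>& x, float eps) {
    float sumsq = 0.f;
    for (float v : x) sumsq += v * v;
    const float scale = 1.0f / std::sqrt(sumsq / x.size() + eps);
    std::vector<float> out(x.size());
    for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] * scale;
    return out;
}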
package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp (new file)
@@ -0,0 +1,35 @@
+ //
+ // MIT license
+ // Copyright (C) 2024 Intel Corporation
+ // SPDX-License-Identifier: MIT
+ //
+
+ //
+ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ // See https://llvm.org/LICENSE.txt for license information.
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ //
+
+ #ifndef GGML_SYCL_NORM_HPP
+ #define GGML_SYCL_NORM_HPP
+
+ #include "common.hpp"
+
+ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
+     ggml_tensor* dst, const float* src0_dd,
+     const float* src1_dd, float* dst_dd,
+     const queue_ptr& main_stream);
+
+ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+     const ggml_tensor* src1, ggml_tensor* dst,
+     const float* src0_dd, const float* src1_dd,
+     float* dst_dd,
+     const queue_ptr& main_stream);
+
+ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+     const ggml_tensor* src1, ggml_tensor* dst,
+     const float* src0_dd, const float* src1_dd,
+     float* dst_dd,
+     const queue_ptr& main_stream);
+
+ #endif // GGML_SYCL_NORM_HPP
package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp (new file)
@@ -0,0 +1,66 @@
+ //
+ // MIT license
+ // Copyright (C) 2024 Intel Corporation
+ // SPDX-License-Identifier: MIT
+ //
+
+ //
+ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ // See https://llvm.org/LICENSE.txt for license information.
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ //
+
+ #ifndef GGML_SYCL_PRESETS_HPP
+ #define GGML_SYCL_PRESETS_HPP
+
+ #define GGML_SYCL_MAX_STREAMS 8
+ #define GGML_SYCL_MAX_BUFFERS 256
+
+ #define WARP_SIZE GGML_SYCL_WARP_SIZE
+ #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+
+ #define SYCL_GELU_BLOCK_SIZE 256
+ #define SYCL_SILU_BLOCK_SIZE 256
+ #define SYCL_TANH_BLOCK_SIZE 256
+ #define SYCL_RELU_BLOCK_SIZE 256
+ #define SYCL_HARDSIGMOID_BLOCK_SIZE 256
+ #define SYCL_HARDSWISH_BLOCK_SIZE 256
+ #define SYCL_SQR_BLOCK_SIZE 256
+ #define SYCL_CPY_BLOCK_SIZE 32
+ #define SYCL_SCALE_BLOCK_SIZE 256
+ #define SYCL_CLAMP_BLOCK_SIZE 256
+ #define SYCL_ROPE_BLOCK_SIZE 256
+ #define SYCL_ALIBI_BLOCK_SIZE 32
+ #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
+ #define SYCL_QUANTIZE_BLOCK_SIZE 256
+ #define SYCL_DEQUANTIZE_BLOCK_SIZE 256
+ #define SYCL_GET_ROWS_BLOCK_SIZE 256
+ #define SYCL_UPSCALE_BLOCK_SIZE 256
+ #define SYCL_CONCAT_BLOCK_SIZE 256
+ #define SYCL_PAD_BLOCK_SIZE 256
+ #define SYCL_ACC_BLOCK_SIZE 256
+ #define SYCL_IM2COL_BLOCK_SIZE 256
+ #define SYCL_POOL2D_BLOCK_SIZE 256
+
+ // dmmv = dequantize_mul_mat_vec
+ #ifndef GGML_SYCL_DMMV_X
+ #define GGML_SYCL_DMMV_X 32
+ #endif
+ #ifndef GGML_SYCL_MMV_Y
+ #define GGML_SYCL_MMV_Y 1
+ #endif
+
+ #ifndef K_QUANTS_PER_ITERATION
+ #define K_QUANTS_PER_ITERATION 2
+ #else
+ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+ #endif
+
+ #ifndef GGML_SYCL_PEER_MAX_BATCH_SIZE
+ #define GGML_SYCL_PEER_MAX_BATCH_SIZE 128
+ #endif // GGML_SYCL_PEER_MAX_BATCH_SIZE
+
+ #define MUL_MAT_SRC1_COL_STRIDE 128
+
+ #define QK_WARP_SIZE 32
+ #endif // GGML_SYCL_PRESETS_HPP
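
Aside: WARP_SIZE in presets.hpp is only an alias for GGML_SYCL_WARP_SIZE (defined elsewhere in the backend's common headers), which is why the reduction kernels above are written against WARP_SIZE rather than a literal. MATRIX_ROW_PADDING pads quantized matrix rows up to a multiple of 512 so vectorized reads past the logical row end stay in bounds; a sketch of the usual round-up arithmetic (pad_to is a hypothetical helper, not from this package):

#include <cstdint>

// Round n up to the next multiple of `padding`, e.g. MATRIX_ROW_PADDING = 512.
static int64_t pad_to(int64_t n, int64_t padding) {
    return ((n + padding - 1) / padding) * padding;
}

// pad_to(1000, 512) == 1024, pad_to(512, 512) == 512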