@fugood/llama.node 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. package/CMakeLists.txt +6 -3
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +3 -3
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  24. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  25. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  26. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  27. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  28. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  29. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  31. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  32. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  36. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  37. package/src/llama.cpp/CMakeLists.txt +91 -1245
  38. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  39. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  40. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  41. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  42. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  43. package/src/llama.cpp/common/common.cpp +1116 -877
  44. package/src/llama.cpp/common/common.h +191 -77
  45. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  46. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  47. package/src/llama.cpp/common/log.h +1 -1
  48. package/src/llama.cpp/common/ngram-cache.h +10 -3
  49. package/src/llama.cpp/common/sampling.cpp +19 -10
  50. package/src/llama.cpp/docs/build.md +353 -0
  51. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  52. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  54. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  56. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  58. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  60. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  61. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  62. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  63. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  64. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  65. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  66. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  67. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  68. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  69. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  71. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  72. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  73. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  75. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  76. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  77. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  79. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  80. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  87. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  88. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  90. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  91. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  92. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  94. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  95. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  96. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  97. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  98. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  99. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  100. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  102. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  103. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  104. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  105. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  106. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  107. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  108. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  110. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  111. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  112. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  113. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  114. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  115. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  116. package/src/llama.cpp/examples/main/main.cpp +98 -75
  117. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  118. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  119. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  120. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  121. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  122. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  123. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  124. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  125. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  126. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  127. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  128. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  129. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  130. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  131. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  132. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  133. package/src/llama.cpp/examples/server/server.cpp +274 -671
  134. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  135. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  136. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  137. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  138. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  139. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  140. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  141. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  142. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  143. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  144. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  145. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  146. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  147. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  148. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  149. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  150. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  151. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  152. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  153. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  154. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  155. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  156. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  157. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  159. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  160. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  161. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  162. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  163. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  178. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  179. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  180. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  181. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  182. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  183. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  184. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  208. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  209. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  210. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  211. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  212. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  214. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  215. package/src/llama.cpp/models/.editorconfig +1 -0
  216. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  217. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  221. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  230. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  233. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  237. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  249. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  252. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  255. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  258. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  259. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  260. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  261. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  263. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  264. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  265. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  266. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  267. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  268. package/src/llama.cpp/requirements.txt +5 -4
  269. package/src/llama.cpp/scripts/build-info.sh +30 -0
  270. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  271. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  272. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  273. package/src/llama.cpp/src/llama-grammar.h +39 -0
  274. package/src/llama.cpp/src/llama-impl.h +26 -0
  275. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  276. package/src/llama.cpp/src/llama-sampling.h +56 -0
  277. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  278. package/src/llama.cpp/src/llama-vocab.h +130 -0
  279. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  280. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  281. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  282. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  283. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  284. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  285. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  286. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  287. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  289. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  290. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  291. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  292. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  293. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  294. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  295. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  296. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  297. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  298. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  299. package/bin/darwin/arm64/default.metallib +0 -0
  300. package/bin/darwin/x64/default.metallib +0 -0
  301. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  302. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  303. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  304. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  305. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  306. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  307. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  308. package/src/llama.cpp/ggml-opencl.h +0 -36
  309. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  310. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  311. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  314. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  315. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  316. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  317. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  318. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  319. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -19,7 +19,11 @@ typedef half2 ggml_half2;
 
  #define GGML_COMMON_DECL
  #elif defined(GGML_COMMON_DECL_CUDA)
+ #if defined(GGML_COMMON_DECL_MUSA)
+ #include <musa_fp16.h>
+ #else
  #include <cuda_fp16.h>
+ #endif
  #include <cstdint>
 
  typedef half ggml_half;
@@ -106,28 +110,34 @@ typedef sycl::half2 ggml_half2;
  #define QR6_K 2
 
  #define QI2_XXS (QK_K / (4*QR2_XXS))
- #define QR2_XXS 8
+ #define QR2_XXS 4
 
  #define QI2_XS (QK_K / (4*QR2_XS))
- #define QR2_XS 8
+ #define QR2_XS 4
 
  #define QI2_S (QK_K / (4*QR2_S))
- #define QR2_S 8
+ #define QR2_S 4
 
  #define QI3_XXS (QK_K / (4*QR3_XXS))
- #define QR3_XXS 8
+ #define QR3_XXS 4
 
  #define QI3_XS (QK_K / (4*QR3_XS))
- #define QR3_XS 8
+ #define QR3_XS 4
 
  #define QI1_S (QK_K / (4*QR1_S))
  #define QR1_S 8
 
+ #define QI1_M (QK_K / (4*QR1_M))
+ #define QR1_M 8
+
  #define QI4_NL (QK4_NL / (4*QR4_NL))
  #define QR4_NL 2
 
  #define QI4_XS (QK_K / (4*QR4_XS))
- #define QR4_XS 8
+ #define QR4_XS 2
+
+ #define QI3_S (QK_K / (4*QR3_S))
+ #define QR3_S 4
 
  #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
@@ -193,6 +203,30 @@ typedef struct {
  } block_q8_1;
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
 
+ typedef struct {
+     ggml_half d[4];        // deltas for 4 q4_0 blocks
+     uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
+ } block_q4_0x4;
+ static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
+
+ typedef struct {
+     ggml_half d[8];        // deltas for 8 q4_0 blocks
+     uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
+ } block_q4_0x8;
+ static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
+
+ typedef struct {
+     ggml_half d[4];        // deltas for 4 q8_0 blocks
+     int8_t qs[QK8_0 * 4];  // quants for 4 q8_0 blocks
+ } block_q8_0x4;
+ static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
+
+ typedef struct {
+     ggml_half d[8];        // deltas for 8 q8_0 blocks
+     int8_t qs[QK8_0 * 8];  // quants for 8 q8_0 blocks
+ } block_q8_0x8;
+ static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
+
  //
  // Super-block quantization structures
  //
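For reference (not part of the diff): with ggml's QK4_0 == 32, QK8_0 == 32 and a 2-byte ggml_half, the static_asserts above work out to 4*2 + 32*2 = 72 bytes for block_q4_0x4 and 8*2 + 32*8 = 272 bytes for block_q8_0x8, i.e. the same storage as four or eight plain q4_0 / q8_0 blocks, just with the per-block deltas grouped at the front, presumably for the new ggml-aarch64.c paths added in this release.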
@@ -385,7 +419,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
  #define GGML_TABLE_END() };
 
  #define GGML_COMMON_IMPL
- #elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
+ #elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA)
  #include <cstdint>
 
  #define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
@@ -17,7 +17,7 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
- #if defined(_WIN32)
+ #if defined(_MSC_VER)
 
  #define m512bh(p) p
  #define m512i(p) p
@@ -609,6 +609,10 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 
  #endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
 
+ #ifdef __ARM_FEATURE_SVE
+ #include <arm_sve.h>
+ #endif // __ARM_FEATURE_SVE
+
  // precomputed f32 table for f16 (256 KB)
  // defined in ggml.c, initialized in ggml_init()
  extern float ggml_table_f32_f16[1 << 16];
@@ -630,21 +634,121 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
  #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
  #endif
 
- #define GGML_HASHTABLE_FULL ((size_t)-1)
- #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
+ // bitset
+
+ static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
+ #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
+ #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
+
+ static size_t ggml_bitset_size(size_t n) {
+     return (n + BITSET_MASK) >> BITSET_SHR;
+ }
+
+ static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
+     return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
+ }
+
+ static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
+     bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
+ }
+
+ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
+     bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
+ }
+
+ // hash set
+
+ #define GGML_HASHSET_FULL ((size_t)-1)
+ #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
 
  struct ggml_hash_set ggml_hash_set_new(size_t size);
+ void ggml_hash_set_free(struct ggml_hash_set * hash_set);
 
- bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+ // returns the minimum size for a hash set that can hold min_sz elements
+ size_t ggml_hash_size(size_t min_sz);
 
- // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
- size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+ // remove all elements from the hash set
+ void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
 
- // returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
- size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
+ // returns true if key is in the hash set
+ static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+ // returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
+ static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+ // returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+ static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
 
  // return index, asserts if table is full
- size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
+ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+ // hash function for ggml_tensor
+ static inline size_t ggml_hash(const struct ggml_tensor * p) {
+     // the last 4 bits are always zero due to alignment
+     return (size_t)(uintptr_t)p >> 4;
+ }
+
+ static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+     size_t h = ggml_hash(key) % hash_set->size;
+
+     // linear probing
+     size_t i = h;
+     while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
+         i = (i + 1) % hash_set->size;
+         if (i == h) {
+             // visited all hash table entries -> not found
+             return GGML_HASHSET_FULL;
+         }
+     }
+     return i;
+ }
+
+ static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+     size_t i = ggml_hash_find(hash_set, key);
+     return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
+ }
+
+ static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+     size_t h = ggml_hash(key) % hash_set->size;
+
+     // linear probing
+     size_t i = h;
+     do {
+         if (!ggml_bitset_get(hash_set->used, i)) {
+             ggml_bitset_set(hash_set->used, i);
+             hash_set->keys[i] = key;
+             return i;
+         }
+         if (hash_set->keys[i] == key) {
+             return GGML_HASHSET_ALREADY_EXISTS;
+         }
+         i = (i + 1) % hash_set->size;
+     } while (i != h);
+
+     // visited all hash table entries -> not found
+     GGML_ABORT("fatal error");
+ }
+
+ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+     size_t h = ggml_hash(key) % hash_set->size;
+
+     // linear probing
+     size_t i = h;
+     do {
+         if (!ggml_bitset_get(hash_set->used, i)) {
+             ggml_bitset_set(hash_set->used, i);
+             hash_set->keys[i] = key;
+             return i;
+         }
+         if (hash_set->keys[i] == key) {
+             return i;
+         }
+         i = (i + 1) % hash_set->size;
+     } while (i != h);
+
+     // visited all hash table entries -> not found
+     GGML_ABORT("fatal error");
+ }
 
  #ifdef __cplusplus
  }
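Not part of the patch itself, just a minimal usage sketch of the pointer-based hash-set API declared above; capacity and node are hypothetical caller-side names, with node being a struct ggml_tensor *:

    struct ggml_hash_set visited = ggml_hash_set_new(capacity); // capacity: expected number of nodes
    if (!ggml_hash_contains(&visited, node)) {
        size_t slot = ggml_hash_insert(&visited, node);          // slot index, or GGML_HASHSET_ALREADY_EXISTS
        (void) slot;
    }
    ggml_hash_set_reset(&visited);                               // remove all elements, the set stays usable
    ggml_hash_set_free(&visited);                                // release the set's storage

Compared with 0.2.3, the set is now passed by pointer, the lookup helpers are defined directly in the header as static functions, and occupancy is tracked with the new ggml_bitset_t bitmap.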
@@ -22,6 +22,7 @@
  #include "shaderop_mul_mat_q4_1.h"
  #include "shaderop_mul_mat_q6_k.h"
  #include "shaderop_mul_mat_mat_f32.h"
+ #include "shaderop_getrows_f32.h"
  #include "shaderop_getrows_f16.h"
  #include "shaderop_getrows_q4_0.h"
  #include "shaderop_getrows_q4_1.h"
@@ -565,7 +566,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
      }
      if ((a % b) != 0) {
          fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
-         GGML_ASSERT(!"safe_divide result would've had remainder");
+         GGML_ABORT("safe_divide result would've had remainder");
      }
      return a / b;
  }
@@ -1146,6 +1147,14 @@ static void ggml_vk_get_rows(
      seq.record<kp::OpAlgoDispatch>(s_algo);
  }
 
+ template <typename... Args>
+ static void ggml_vk_get_rows_f32(Args&&... args) {
+     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
+         kp::shader_data::op_getrows_f32_comp_spv_len);
+
+     ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
+ }
+
  template <typename... Args>
  static void ggml_vk_get_rows_f16(Args&&... args) {
      const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
@@ -1183,7 +1192,7 @@ static void ggml_vk_rope(
      const std::shared_ptr<kp::Tensor>& inB,
      const std::shared_ptr<kp::Tensor>& out,
      uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-     ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_orig_ctx,
+     ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
      float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
      int32_t ne01, int32_t ne02, int32_t ne03,
      uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
@@ -1212,14 +1221,14 @@
 
      struct PushConstants {
          uint32_t inAOff, inBOff, outOff;
-         int32_t n_dims, mode, n_orig_ctx;
+         int32_t n_dims, mode, n_ctx_orig;
          float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
          uint32_t nb00, nb01, nb02, nb03;
          int32_t ne0;
          uint32_t nb0, nb1, nb2, nb3;
      } pushConsts {
          safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
-         n_dims, mode, n_orig_ctx,
+         n_dims, mode, n_ctx_orig,
          freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
          nb00, nb01, nb02, nb03,
          ne0,
@@ -1331,7 +1340,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
          case GGML_UNARY_OP_RELU:
          case GGML_UNARY_OP_GELU:
          case GGML_UNARY_OP_SILU:
-             return true;
+             return ggml_is_contiguous(op->src[0]);
          default:
              ;
      }
@@ -1371,6 +1380,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
          return op->ne[3] == 1;
      case GGML_OP_GET_ROWS:
          switch (op->src[0]->type) {
+             case GGML_TYPE_F32:
              case GGML_TYPE_F16:
              case GGML_TYPE_Q4_0:
              case GGML_TYPE_Q4_1:
@@ -1450,7 +1460,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
 
      if (!ggml_vk_supports_op(dst)) {
          fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
-         GGML_ASSERT(!"unsupported op");
+         GGML_ABORT("unsupported op");
      }
 
      const int32_t ne00 = src0 ? src0->ne[0] : 0;
@@ -1552,7 +1562,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
          default:
              {
                  fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                 GGML_ASSERT(false);
+                 GGML_ABORT("fatal error");
              }
      }
  } break;
1597
1607
  {
1598
1608
  GGML_ASSERT(ne00 == ne10);
1599
1609
 
1600
- // TODO: assert that dim2 and dim3 are contiguous
1601
1610
  GGML_ASSERT(ne12 % ne02 == 0);
1602
1611
  GGML_ASSERT(ne13 % ne03 == 0);
1603
1612
 
@@ -1662,7 +1671,9 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1662
1671
  } break;
1663
1672
  case GGML_OP_GET_ROWS:
1664
1673
  {
1665
- if (src0t == GGML_TYPE_F16) {
1674
+ if (src0t == GGML_TYPE_F32) {
1675
+ ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1676
+ } else if (src0t == GGML_TYPE_F16) {
1666
1677
  ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1667
1678
  } else if (src0t == GGML_TYPE_Q4_0) {
1668
1679
  ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
@@ -1681,13 +1692,16 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1681
1692
  #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
1682
1693
  GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
1683
1694
 
1695
+ #pragma message("TODO: update rope NORM mode to match NEOX mode")
1696
+ #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
1697
+
1684
1698
  GGML_ASSERT(ne10 == ne02);
1685
1699
  GGML_ASSERT(src0t == dstt);
1686
1700
  // const int n_past = ((int32_t *) dst->op_params)[0];
1687
1701
  const int n_dims = ((int32_t *) dst->op_params)[1];
1688
1702
  const int mode = ((int32_t *) dst->op_params)[2];
1689
1703
  // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
1690
- const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
1704
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
1691
1705
 
1692
1706
  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
1693
1707
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
@@ -1697,7 +1711,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
          memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
          memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
          ggml_vk_rope(
-             seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_orig_ctx,
+             seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig,
              freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
              ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
          );
@@ -1731,7 +1745,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
          continue;
      not_implemented: {}
          fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-         //GGML_ASSERT(false);
+         //GGML_ABORT("fatal error");
      }
 
      // Evaluate sequence
@@ -1888,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
      return ctx->max_alloc;
  }
 
- static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-     GGML_UNUSED(buft);
-     return ggml_backend_is_kompute(backend);
- }
-
  static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
      /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
      /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
      /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
      /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
      /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-     /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
      /* .is_host = */ NULL,
  };
 
@@ -1959,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
      return ggml_vk_supports_op(op);
  }
 
+ static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+     GGML_UNUSED(backend);
+     return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
+ }
+
  static struct ggml_backend_i kompute_backend_i = {
      /* .get_name = */ ggml_backend_kompute_name,
      /* .free = */ ggml_backend_kompute_free,
@@ -1969,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = {
      /* .synchronize = */ NULL,
      /* .graph_plan_create = */ NULL,
      /* .graph_plan_free = */ NULL,
+     /* .graph_plan_update = */ NULL,
      /* .graph_plan_compute = */ NULL,
      /* .graph_compute = */ ggml_backend_kompute_graph_compute,
      /* .supports_op = */ ggml_backend_kompute_supports_op,
+     /* .supports_buft = */ ggml_backend_kompute_supports_buft,
      /* .offload_op = */ NULL,
      /* .event_new = */ NULL,
      /* .event_free = */ NULL,