@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -1,7 +1,6 @@
1
1
  #include <ggml.h>
2
2
  #include <ggml-alloc.h>
3
3
  #include <ggml-backend.h>
4
- #include <ggml-backend-impl.h>
5
4
 
6
5
  #include <algorithm>
7
6
  #include <array>
@@ -80,14 +79,22 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
80
79
  im = nullptr;
81
80
  }
82
81
  }
82
+
83
83
  ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
84
84
  GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
85
+ // TODO: other cases
86
+ //#pragma omp parallel for
87
+ //for (int i = 0; i < tensor->ne[1]; i++) {
88
+ // ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
89
+ // i * tensor->ne[0], 1, tensor->ne[0], im);
90
+ //}
91
+
85
92
  ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
86
93
  } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
87
94
  // This is going to create some weird integers though.
88
95
  ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
89
96
  } else {
90
- GGML_ASSERT(false);
97
+ GGML_ABORT("fatal error");
91
98
  }
92
99
  }
93
100
 
@@ -125,7 +132,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
125
132
  tt.to_float(&buf[i], vq.data(), bs);
126
133
  tv.insert(tv.end(), vq.begin(), vq.end());
127
134
  } else {
128
- GGML_ASSERT(false);
135
+ GGML_ABORT("fatal error");
129
136
  }
130
137
  }
131
138
  }
@@ -642,20 +649,29 @@ struct test_case {
642
649
  struct test_unary : public test_case {
643
650
  const ggml_unary_op op;
644
651
  const ggml_type type;
645
- const std::array<int64_t, 4> ne;
652
+ const std::array<int64_t, 4> ne_a;
653
+ int v; // view (1 : non-contiguous a)
646
654
 
647
655
  std::string vars() override {
648
- return VARS_TO_STR2(type, ne);
656
+ return VARS_TO_STR3(type, ne_a, v);
649
657
  }
650
658
 
651
659
  test_unary(ggml_unary_op op,
652
660
  ggml_type type = GGML_TYPE_F32,
653
- std::array<int64_t, 4> ne = {128, 10, 10, 10})
654
- : op(op), type(type), ne(ne) {}
661
+ std::array<int64_t, 4> ne_a = {128, 10, 10, 10},
662
+ int v = 0)
663
+ : op(op), type(type), ne_a(ne_a), v(v) {}
655
664
 
656
665
  ggml_tensor * build_graph(ggml_context * ctx) override {
657
- ggml_tensor * in = ggml_new_tensor(ctx, type, 4, ne.data());
658
- ggml_tensor * out = ggml_unary(ctx, in, op);
666
+ ggml_tensor * a;
667
+ if (v & 1) {
668
+ auto ne = ne_a; ne[0] *= 3;
669
+ a = ggml_new_tensor(ctx, type, 4, ne.data());
670
+ a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
671
+ } else {
672
+ a = ggml_new_tensor(ctx, type, 4, ne_a.data());
673
+ }
674
+ ggml_tensor * out = ggml_unary(ctx, a, op);
659
675
  return out;
660
676
  }
661
677
 
@@ -751,7 +767,7 @@ struct test_dup : public test_case {
751
767
  }
752
768
 
753
769
  test_dup(ggml_type type = GGML_TYPE_F32,
754
- std::array<int64_t, 4> ne = {10, 10, 10, 1},
770
+ std::array<int64_t, 4> ne = {10, 10, 20, 1},
755
771
  std::array<int64_t, 4> permute = {0, 0, 0, 0})
756
772
  : type(type), ne(ne), permute(permute),
757
773
  _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
@@ -771,9 +787,15 @@ struct test_cpy : public test_case {
771
787
  const ggml_type type_src;
772
788
  const ggml_type type_dst;
773
789
  const std::array<int64_t, 4> ne;
790
+ const std::array<int64_t, 4> permute;
791
+ bool _src_use_permute;
774
792
 
775
793
  std::string vars() override {
776
- return VARS_TO_STR3(type_src, type_dst, ne);
794
+ return VARS_TO_STR4(type_src, type_dst, ne, permute);
795
+ }
796
+
797
+ double max_nmse_err() override {
798
+ return 1e-6;
777
799
  }
778
800
 
779
801
  size_t op_size(ggml_tensor * t) override {
@@ -781,12 +803,18 @@ struct test_cpy : public test_case {
781
803
  }
782
804
 
783
805
  test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
784
- std::array<int64_t, 4> ne = {10, 10, 10, 1})
785
- : type_src(type_src), type_dst(type_dst), ne(ne) {}
806
+ std::array<int64_t, 4> ne = {10, 10, 10, 1},
807
+ std::array<int64_t, 4> permute = {0, 0, 0, 0},
808
+ bool _dst_use_permute = false)
809
+ : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
810
+ _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
786
811
 
787
812
  ggml_tensor * build_graph(ggml_context * ctx) override {
788
813
  ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
789
- ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
814
+ if (_src_use_permute) {
815
+ src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
816
+ }
817
+ ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
790
818
  ggml_tensor * out = ggml_cpy(ctx, src, dst);
791
819
  return out;
792
820
  }
@@ -1054,6 +1082,33 @@ struct test_sqr : public test_case {
1054
1082
  }
1055
1083
  };
1056
1084
 
1085
+ // GGML_OP_SQRT
1086
+ struct test_sqrt : public test_case {
1087
+ const ggml_type type;
1088
+ const std::array<int64_t, 4> ne;
1089
+
1090
+ std::string vars() override {
1091
+ return VARS_TO_STR2(type, ne);
1092
+ }
1093
+
1094
+ test_sqrt(ggml_type type = GGML_TYPE_F32,
1095
+ std::array<int64_t, 4> ne = {10, 10, 10, 10})
1096
+ : type(type), ne(ne) {}
1097
+
1098
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1099
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1100
+ ggml_tensor * out = ggml_sqrt(ctx, a);
1101
+ return out;
1102
+ }
1103
+
1104
+ void initialize_tensors(ggml_context * ctx) override {
1105
+ // fill with positive values
1106
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1107
+ init_tensor_uniform(t, 0.0f, 100.0f);
1108
+ }
1109
+ }
1110
+ };
1111
+
1057
1112
  // GGML_OP_CLAMP
1058
1113
  struct test_clamp : public test_case {
1059
1114
  const ggml_type type;
@@ -1135,29 +1190,41 @@ struct test_soft_max : public test_case {
1135
1190
  }
1136
1191
  };
1137
1192
 
1193
+
1138
1194
  // GGML_OP_ROPE
1139
1195
  struct test_rope : public test_case {
1140
1196
  const ggml_type type;
1141
- const std::array<int64_t, 4> ne;
1197
+ const std::array<int64_t, 4> ne_a;
1142
1198
  int n_dims;
1143
1199
  int mode;
1144
- int n_ctx;
1200
+ int n_ctx; // used to generate positions
1201
+ float fs; // freq_scale
1202
+ float ef; // ext_factor
1203
+ float af; // attn_factor
1145
1204
  bool ff;
1205
+ int v; // view (1 : non-contiguous a)
1146
1206
 
1147
1207
  std::string vars() override {
1148
- return VARS_TO_STR6(type, ne, n_dims, mode, n_ctx, ff);
1208
+ return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
1149
1209
  }
1150
1210
 
1151
1211
  test_rope(ggml_type type = GGML_TYPE_F32,
1152
- std::array<int64_t, 4> ne = {10, 10, 10, 1},
1153
- int n_dims = 10, int mode = 0, int n_ctx = 512, bool ff = false)
1154
- : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx), ff(ff) {}
1212
+ std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
1213
+ int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
1214
+ : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
1155
1215
 
1156
1216
  ggml_tensor * build_graph(ggml_context * ctx) override {
1157
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1158
- ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
1217
+ ggml_tensor * a;
1218
+ if (v & 1) {
1219
+ auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
1220
+ a = ggml_new_tensor(ctx, type, 4, ne.data());
1221
+ a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
1222
+ } else {
1223
+ a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1224
+ }
1225
+ ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
1159
1226
  ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
1160
- ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
1227
+ ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
1161
1228
  return out;
1162
1229
  }
1163
1230
 
@@ -1165,11 +1232,11 @@ struct test_rope : public test_case {
1165
1232
  for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1166
1233
  if (t->type == GGML_TYPE_I32) {
1167
1234
  // pos
1168
- std::vector<int> data(ne[2]);
1169
- for (int i = 0; i < ne[2]; i++) {
1235
+ std::vector<int> data(ne_a[2]);
1236
+ for (int i = 0; i < ne_a[2]; i++) {
1170
1237
  data[i] = rand() % n_ctx;
1171
1238
  }
1172
- ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
1239
+ ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
1173
1240
  } else {
1174
1241
  if (t->ne[0] == n_dims/2) {
1175
1242
  // frequency factors in the range [0.9f, 1.1f]
@@ -1216,6 +1283,32 @@ struct test_pool2d : public test_case {
1216
1283
  }
1217
1284
  };
1218
1285
 
1286
+ // GGML_OP_CONV_TRANSPOSE_1D
1287
+ struct test_conv_transpose_1d : public test_case {
1288
+ const std::array<int64_t, 4> ne_input;
1289
+ const std::array<int64_t, 4> ne_kernel;
1290
+
1291
+ const int s0; // stride
1292
+ const int p0; // padding
1293
+ const int d0; // dilation
1294
+
1295
+ std::string vars() override {
1296
+ return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
1297
+ }
1298
+
1299
+ test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
1300
+ std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
1301
+ int s0 = 1, int p0 = 0, int d0 = 1)
1302
+ : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
1303
+
1304
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1305
+ ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
1306
+ ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
1307
+ ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
1308
+ return out;
1309
+ }
1310
+ };
1311
+
1219
1312
  // GGML_OP_IM2COL
1220
1313
  struct test_im2col : public test_case {
1221
1314
  const ggml_type type_input;
@@ -1229,7 +1322,7 @@ struct test_im2col : public test_case {
1229
1322
  // padding
1230
1323
  const int p0;
1231
1324
  const int p1;
1232
- // dilatation
1325
+ // dilation
1233
1326
  const int d0;
1234
1327
  const int d1;
1235
1328
  // mode
@@ -1262,22 +1355,37 @@ struct test_concat : public test_case {
1262
1355
  const std::array<int64_t, 4> ne_a;
1263
1356
  const int64_t ne_b_d;
1264
1357
  const int dim;
1358
+ const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b)
1265
1359
 
1266
1360
  std::string vars() override {
1267
- return VARS_TO_STR4(type, ne_a, ne_b_d, dim);
1361
+ return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v);
1268
1362
  }
1269
1363
 
1270
1364
  test_concat(ggml_type type = GGML_TYPE_F32,
1271
1365
  std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
1272
1366
  int64_t ne_b_d = 10,
1273
- int dim = 2)
1274
- : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim) {}
1367
+ int dim = 2, int v = 0)
1368
+ : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}
1275
1369
 
1276
1370
  ggml_tensor * build_graph(ggml_context * ctx) override {
1277
1371
  auto ne_b = ne_a;
1278
1372
  ne_b[dim] = ne_b_d;
1279
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1280
- ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
1373
+ ggml_tensor * a;
1374
+ if (v & 1) {
1375
+ auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
1376
+ a = ggml_new_tensor(ctx, type, 4, ne.data());
1377
+ a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
1378
+ } else {
1379
+ a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1380
+ }
1381
+ ggml_tensor * b;
1382
+ if (v & 2) {
1383
+ auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
1384
+ b = ggml_new_tensor(ctx, type, 4, ne.data());
1385
+ b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
1386
+ } else {
1387
+ b = ggml_new_tensor(ctx, type, 4, ne_b.data());
1388
+ }
1281
1389
  ggml_tensor * out = ggml_concat(ctx, a, b, dim);
1282
1390
  return out;
1283
1391
  }
@@ -1327,7 +1435,7 @@ struct test_argsort : public test_case {
1327
1435
  ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
1328
1436
  }
1329
1437
  } else {
1330
- GGML_ASSERT(false);
1438
+ GGML_ABORT("fatal error");
1331
1439
  }
1332
1440
  }
1333
1441
  }
@@ -1544,21 +1652,25 @@ struct test_flash_attn_ext : public test_case {
1544
1652
 
1545
1653
  const float max_bias; // ALiBi
1546
1654
 
1655
+ const ggml_type type_KV;
1656
+
1547
1657
  std::string vars() override {
1548
- return VARS_TO_STR6(hs, nh, kv, nb, mask, max_bias);
1658
+ return VARS_TO_STR7(hs, nh, kv, nb, mask, max_bias, type_KV);
1549
1659
  }
1550
1660
 
1551
1661
  double max_nmse_err() override {
1552
1662
  return 5e-4;
1553
1663
  }
1554
1664
 
1555
- test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f)
1556
- : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias) {}
1665
+ test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
1666
+ : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), type_KV(type_KV) {}
1557
1667
 
1558
1668
  ggml_tensor * build_graph(ggml_context * ctx) override {
1559
- ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
1560
- ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
1561
- ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
1669
+ const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
1670
+
1671
+ ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1);
1672
+ ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
1673
+ ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
1562
1674
  ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
1563
1675
  ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
1564
1676
  return out;
@@ -1585,7 +1697,7 @@ struct llama_hparams {
1585
1697
 
1586
1698
  // cparams
1587
1699
  static constexpr uint32_t n_ctx = 512; // user-specified context size
1588
- static constexpr uint32_t n_orig_ctx = n_ctx;
1700
+ static constexpr uint32_t n_ctx_orig = n_ctx;
1589
1701
 
1590
1702
  // batch
1591
1703
  int32_t n_tokens;
@@ -1776,13 +1888,13 @@ struct test_llama : public test_llm {
1776
1888
 
1777
1889
  Qcur = ggml_rope_ext(
1778
1890
  ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr,
1779
- hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
1891
+ hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
1780
1892
  ext_factor, attn_factor, beta_fast, beta_slow
1781
1893
  );
1782
1894
 
1783
1895
  Kcur = ggml_rope_ext(
1784
1896
  ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr,
1785
- hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
1897
+ hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
1786
1898
  ext_factor, attn_factor, beta_fast, beta_slow
1787
1899
  );
1788
1900
 
@@ -1901,12 +2013,12 @@ struct test_falcon : public test_llm {
1901
2013
 
1902
2014
  // using mode = 2 for neox mode
1903
2015
  Qcur = ggml_rope_ext(
1904
- ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, 0, hp.n_orig_ctx,
2016
+ ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
1905
2017
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
1906
2018
  );
1907
2019
 
1908
2020
  Kcur = ggml_rope_ext(
1909
- ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, 0, hp.n_orig_ctx,
2021
+ ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
1910
2022
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
1911
2023
  );
1912
2024
 
@@ -1983,12 +2095,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
1983
2095
  GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
1984
2096
  GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
1985
2097
  GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
2098
+ GGML_TYPE_BF16,
1986
2099
  };
1987
2100
 
1988
2101
  // unary ops
1989
- for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
1990
- test_cases.emplace_back(new test_unary((ggml_unary_op) op));
1991
- test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }));
2102
+ for (int v : {0, 1}) {
2103
+ for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
2104
+ test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 10, 10, 10 }, v));
2105
+ test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }, v));
2106
+ }
1992
2107
  }
1993
2108
 
1994
2109
  test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
@@ -2026,6 +2141,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2026
2141
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
2027
2142
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
2028
2143
 
2144
+ test_cases.emplace_back(new test_conv_transpose_1d());
2145
+ test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
2146
+ test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
2147
+ test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
2148
+ test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
2149
+ test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
2150
+ test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
2151
+ test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
2152
+
2153
+
2029
2154
  test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
2030
2155
  test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
2031
2156
  test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
@@ -2038,12 +2163,22 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2038
2163
  test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
2039
2164
  test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
2040
2165
  test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
2166
+ test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
2167
+ test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
2168
+ test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
2169
+ test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
2041
2170
  test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
2042
2171
  test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
2043
2172
 
2044
2173
  for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2045
2174
  for (ggml_type type_dst : all_types) {
2046
2175
  test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
2176
+ test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
2177
+ }
2178
+ }
2179
+ for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2180
+ for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2181
+ test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
2047
2182
  }
2048
2183
  }
2049
2184
 
@@ -2093,6 +2228,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2093
2228
  test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
2094
2229
  }
2095
2230
 
2231
+ #if 1
2096
2232
  for (ggml_type type_a : base_types) {
2097
2233
  for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
2098
2234
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
@@ -2112,6 +2248,24 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2112
2248
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
2113
2249
  }
2114
2250
  }
2251
+ #else
2252
+ // m = a rows
2253
+ // n = b rows
2254
+ // k = cols
2255
+ std::uniform_int_distribution<> dist_m(1, 128);
2256
+ std::uniform_int_distribution<> dist_n(16, 128);
2257
+ std::uniform_int_distribution<> dist_k(1, 16);
2258
+ for (int i = 0; i < 1000; i++) {
2259
+ for (ggml_type type_a : all_types) {
2260
+ for (ggml_type type_b : {GGML_TYPE_F32}) {
2261
+ int m = dist_m(rng);
2262
+ int n = dist_n(rng);
2263
+ int k = dist_k(rng) * ggml_blck_size(type_a);
2264
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
2265
+ }
2266
+ }
2267
+ }
2268
+ #endif
2115
2269
 
2116
2270
  for (ggml_type type_a : other_types) {
2117
2271
  for (ggml_type type_b : {GGML_TYPE_F32}) {
@@ -2159,6 +2313,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2159
2313
  }
2160
2314
 
2161
2315
  test_cases.emplace_back(new test_sqr());
2316
+ test_cases.emplace_back(new test_sqrt());
2162
2317
  test_cases.emplace_back(new test_clamp());
2163
2318
 
2164
2319
  test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
@@ -2174,7 +2329,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2174
2329
  for (int n = 0; n < 10; ++n) {
2175
2330
  int64_t ne0 = dist_ne0(rng);
2176
2331
  int64_t ne1 = dist_ne1(rng);
2177
- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
2332
+ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
2178
2333
  }
2179
2334
 
2180
2335
  exponent <<= 1;
@@ -2193,31 +2348,52 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2193
2348
  }
2194
2349
  }
2195
2350
  }
2196
-
2351
+ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
2197
2352
  test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
2198
2353
  test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
2199
2354
  test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
2200
2355
 
2201
- for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
2202
- // TODO: ff not supported yet for !neox
2203
- test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, false)); // llama 7B
2204
- test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, false)); // llama 13B
2205
- test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, false)); // llama 30B
2206
- test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, false)); // llama 65B
2356
+ {
2357
+ bool all = true;
2358
+
2359
+ for (float v : { 0, 1 }) {
2360
+ for (float fs : { 1.0f, 1.4245f }) {
2361
+ for (float ef : { 0.0f, 0.7465f }) {
2362
+ for (float af : { 1.0f, 1.4245f }) {
2363
+ for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
2364
+ for (bool ff : {false, true}) { // freq_factors
2365
+ test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B
2366
+
2367
+ if (all) {
2368
+ test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B
2369
+ test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B
2370
+ test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B
2371
+ }
2372
+
2373
+ if (all) {
2374
+ test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
2375
+ test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
2376
+ test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
2377
+ test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
2378
+ test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
2379
+ }
2380
+
2381
+ test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
2382
+ }
2383
+ }
2207
2384
 
2208
- for (bool ff : {false, true}) { // freq_factors
2209
- test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, ff)); // neox (falcon 7B)
2210
- test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, ff)); // neox (falcon 7B)
2211
- test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B)
2212
- test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B)
2213
- test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, ff)); // neox (stablelm)
2214
- test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, ff)); // neox (phi-2)
2385
+ all = false;
2386
+ }
2387
+ }
2388
+ }
2215
2389
  }
2216
2390
  }
2217
2391
 
2218
- for (int dim : { 0, 1, 2, 3, }) {
2219
- test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim));
2220
- test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim));
2392
+ for (int v : { 0, 1, 2, 3 }) {
2393
+ for (int dim : { 0, 1, 2, 3, }) {
2394
+ test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
2395
+ test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
2396
+ }
2221
2397
  }
2222
2398
 
2223
2399
  for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
@@ -2244,7 +2420,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2244
2420
  for (int nh : { 32, }) {
2245
2421
  for (int kv : { 512, 1024, }) {
2246
2422
  for (int nb : { 1, 2, 4, 8, }) {
2247
- test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias));
2423
+ for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
2424
+ test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, type_KV));
2425
+ }
2248
2426
  }
2249
2427
  }
2250
2428
  }
@@ -2284,7 +2462,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2284
2462
  return true;
2285
2463
  }
2286
2464
 
2287
- GGML_ASSERT(false);
2465
+ GGML_ABORT("fatal error");
2288
2466
  return false;
2289
2467
  }
2290
2468