@fugood/llama.node 0.3.2 → 0.3.4

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -1,6 +1,7 @@
1
1
  // Unit tests for quantization specific functions - quantize, dequantize and dot product
2
2
 
3
3
  #include "ggml.h"
4
+ #include "ggml-cpu.h"
4
5
 
5
6
  #undef NDEBUG
6
7
  #include <assert.h>
@@ -44,26 +45,27 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
44
45
  }
45
46
 
46
47
  // Total quantization error on test data
47
- static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
48
+ static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
48
49
  std::vector<uint8_t> tmp_q(2*test_size);
49
50
  std::vector<float> tmp_out(test_size);
50
51
 
51
- qfns.from_float(test_data, tmp_q.data(), test_size);
52
- qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
52
+ qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
53
+ qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
53
54
  return array_rmse(test_data, tmp_out.data(), test_size);
54
55
  }
55
56
 
56
57
  // Total quantization error on test data
57
- static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
58
+ static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
58
59
  std::vector<uint8_t> tmp_q(2*test_size);
59
60
  std::vector<float> tmp_out(test_size);
60
61
  std::vector<float> tmp_out_ref(test_size);
61
62
 
62
- qfns.from_float(test_data, tmp_q.data(), test_size);
63
- qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
63
+ // FIXME: why is done twice?
64
+ qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
65
+ qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
64
66
 
65
- qfns.from_float_ref(test_data, tmp_q.data(), test_size);
66
- qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
67
+ qfns->from_float_ref(test_data, tmp_q.data(), test_size);
68
+ qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
67
69
 
68
70
  return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
69
71
  }
@@ -77,19 +79,19 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
77
79
  }
78
80
 
79
81
  // Total dot product error
80
- static float dot_product_error(
81
- ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
82
- ) {
82
+ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
83
+ GGML_UNUSED(qfns);
84
+
83
85
  std::vector<uint8_t> tmp_q1(2*test_size);
84
86
  std::vector<uint8_t> tmp_q2(2*test_size);
85
87
 
86
- auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
88
+ const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
87
89
 
88
- qfns.from_float(test_data1, tmp_q1.data(), test_size);
89
- vdot.from_float(test_data2, tmp_q2.data(), test_size);
90
+ qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
91
+ vdot->from_float(test_data2, tmp_q2.data(), test_size);
90
92
 
91
93
  float result = INFINITY;
92
- qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
94
+ qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
93
95
 
94
96
  const float dot_ref = dot_product(test_data1, test_data2, test_size);
95
97
 
@@ -131,10 +133,11 @@ int main(int argc, char * argv[]) {
131
133
 
132
134
  for (int i = 0; i < GGML_TYPE_COUNT; i++) {
133
135
  ggml_type type = (ggml_type) i;
134
- ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
136
+ const auto * qfns = ggml_get_type_traits(type);
137
+ const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
135
138
 
136
139
  // deprecated - skip
137
- if (qfns.blck_size == 0) {
140
+ if (qfns->blck_size == 0) {
138
141
  continue;
139
142
  }
140
143
 
@@ -143,8 +146,8 @@ int main(int argc, char * argv[]) {
143
146
  printf("Testing %s\n", ggml_type_name((ggml_type) i));
144
147
  ggml_quantize_init(ei);
145
148
 
146
- if (qfns.from_float && qfns.to_float) {
147
- const float total_error = total_quantization_error(qfns, test_size, test_data.data());
149
+ if (qfns_cpu->from_float && qfns->to_float) {
150
+ const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
148
151
  const float max_quantization_error =
149
152
  type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
150
153
  type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
@@ -159,14 +162,14 @@ int main(int argc, char * argv[]) {
159
162
  printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
160
163
  }
161
164
 
162
- const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
165
+ const float reference_error = reference_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
163
166
  failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
164
167
  num_failed += failed;
165
168
  if (failed || verbose) {
166
169
  printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
167
170
  }
168
171
 
169
- const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
172
+ const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
170
173
  const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
171
174
  type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
172
175
  ? MAX_DOT_PRODUCT_ERROR_LOWBIT
@@ -1,12 +1,12 @@
1
1
  // Benchmark quantization specific functions on synthetic data
2
2
 
3
3
  #include "ggml.h"
4
+ #include "ggml-cpu.h"
4
5
 
5
6
  #undef NDEBUG
6
7
  #include <algorithm>
7
8
  #include <assert.h>
8
9
  #include <functional>
9
- #include <inttypes.h>
10
10
  #include <math.h>
11
11
  #include <memory>
12
12
  #include <stdio.h>
@@ -122,9 +122,10 @@ static void usage(char * argv[]) {
122
122
  printf(" --type TYPE set test type as");
123
123
  for (int i = 0; i < GGML_TYPE_COUNT; i++) {
124
124
  ggml_type type = (ggml_type) i;
125
- ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
125
+ const auto * qfns = ggml_get_type_traits(type);
126
+ const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
126
127
  if (ggml_type_name(type) != NULL) {
127
- if (qfns.from_float && qfns.to_float) {
128
+ if (qfns_cpu->from_float && qfns->to_float) {
128
129
  printf(" %s", ggml_type_name(type));
129
130
  }
130
131
  }
@@ -270,12 +271,13 @@ int main(int argc, char * argv[]) {
270
271
 
271
272
  for (int i = 0; i < GGML_TYPE_COUNT; i++) {
272
273
  ggml_type type = (ggml_type) i;
273
- ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
274
+ const auto * qfns = ggml_get_type_traits(type);
275
+ const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
274
276
  if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
275
277
  continue;
276
278
  }
277
279
 
278
- if (qfns.from_float && qfns.to_float) {
280
+ if (qfns_cpu->from_float && qfns->to_float) {
279
281
  printf("%s\n", ggml_type_name(type));
280
282
 
281
283
  ggml_quantize_init(type);
@@ -285,7 +287,7 @@ int main(int argc, char * argv[]) {
285
287
  for (size_t size : params.test_sizes) {
286
288
  printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
287
289
  auto quantize_fn = [&](void) -> float {
288
- qfns.from_float_ref(test_data1, test_q1, size);
290
+ qfns->from_float_ref(test_data1, test_q1, size);
289
291
  return test_q1[0];
290
292
  };
291
293
  size_t quantized_size = ggml_row_size(type, size);
@@ -299,7 +301,7 @@ int main(int argc, char * argv[]) {
299
301
  for (size_t size : params.test_sizes) {
300
302
  printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
301
303
  auto quantize_fn = [&](void) -> float {
302
- qfns.from_float(test_data1, test_q1, size);
304
+ qfns_cpu->from_float(test_data1, test_q1, size);
303
305
  return test_q1[0];
304
306
  };
305
307
  size_t quantized_size = ggml_row_size(type, size);
@@ -310,11 +312,11 @@ int main(int argc, char * argv[]) {
310
312
 
311
313
  if (params.op_dequantize_row_q) {
312
314
  printf(" dequantize_row_q\n");
313
- qfns.from_float(test_data1, test_q1, largest);
315
+ qfns_cpu->from_float(test_data1, test_q1, largest);
314
316
  for (size_t size : params.test_sizes) {
315
317
  printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
316
318
  auto quantize_fn = [&](void) -> float {
317
- qfns.to_float(test_q1, test_out, size);
319
+ qfns->to_float(test_q1, test_out, size);
318
320
  return test_out[0];
319
321
  };
320
322
  size_t quantized_size = ggml_row_size(type, size);
@@ -328,8 +330,8 @@ int main(int argc, char * argv[]) {
328
330
  for (size_t size : params.test_sizes) {
329
331
  printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
330
332
  auto quantize_fn = [&](void) -> float {
331
- auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
332
- vdot.from_float(test_data1, test_q1, size);
333
+ const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
334
+ vdot->from_float(test_data1, test_q1, size);
333
335
  return test_q1[0];
334
336
  };
335
337
  size_t quantized_size = ggml_row_size(type, size);
@@ -340,13 +342,13 @@ int main(int argc, char * argv[]) {
340
342
 
341
343
  if (params.op_vec_dot_q) {
342
344
  printf(" vec_dot_q\n");
343
- qfns.from_float(test_data1, test_q1, largest);
344
- qfns.from_float(test_data2, test_q2, largest);
345
+ qfns_cpu->from_float(test_data1, test_q1, largest);
346
+ qfns_cpu->from_float(test_data2, test_q2, largest);
345
347
  for (size_t size : params.test_sizes) {
346
348
  printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
347
349
  auto quantize_fn = [&](void) -> float {
348
350
  float result;
349
- qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
351
+ qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
350
352
  return result;
351
353
  };
352
354
  size_t quantized_size = ggml_row_size(type, size);
@@ -1,4 +1,5 @@
1
1
  #include "ggml.h"
2
+ #include "ggml-cpu.h"
2
3
 
3
4
  #include <cmath>
4
5
  #include <cstdio>
@@ -137,7 +138,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
137
138
  struct ggml_tensor * x;
138
139
 
139
140
  // rope f32
140
- for (int m = 0; m < 3; ++m) {
141
+ for (int m = 0; m < 5; ++m) {
141
142
  const int ndims = 4;
142
143
 
143
144
  const int64_t n_rot = 128;
@@ -146,28 +147,69 @@ int main(int /*argc*/, const char ** /*argv*/) {
146
147
  const int n_past_0 = 100;
147
148
  const int n_past_2 = 33;
148
149
 
149
- struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
150
- struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
151
- struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
152
-
153
- for (int i = 0; i < ne[2]; ++i) {
154
- ((int32_t *) p0->data)[i] = n_past_0 + i;
155
- ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
156
- ((int32_t *) p2->data)[i] = n_past_2 + i;
157
- }
158
-
159
- // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
160
- const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
161
-
150
+ struct ggml_tensor * r0;
151
+ struct ggml_tensor * r1;
152
+ struct ggml_tensor * r2;
162
153
  x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
154
+ int mode = -1;
163
155
 
164
- // 100, 101, 102, ..., 172
165
- struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
166
- // -67, -67, -67, ..., -67
167
- struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
156
+ if (m < 3) {
157
+ struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
158
+ struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
159
+ struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
168
160
 
169
- // 33, 34, 35, ..., 105
170
- struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
161
+ for (int i = 0; i < ne[2]; ++i) {
162
+ ((int32_t *) p0->data)[i] = n_past_0 + i;
163
+ ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
164
+ ((int32_t *) p2->data)[i] = n_past_2 + i;
165
+ }
166
+ // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
167
+ mode = m == 0 ? 0 : m == 1 ? 2 : 4;
168
+
169
+ // 100, 101, 102, ..., 172
170
+ r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
171
+ // -67, -67, -67, ..., -67
172
+ r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
173
+
174
+ // 33, 34, 35, ..., 105
175
+ r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
176
+ } else {
177
+ // testing multi-dimension rope position embedding mode
178
+ struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
179
+ struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
180
+ struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
181
+
182
+ int sections[4] = {16, 24, 24, 0};
183
+ mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : GGML_ROPE_TYPE_VISION;
184
+
185
+ for (int i = 0; i < ne[2]; ++i) {
186
+ for (int j = 0; j < 4; ++j) {
187
+ ((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j;
188
+ ((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0;
189
+ ((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
190
+ }
191
+ }
192
+
193
+ // [[100, 101, 102, ..., 172],
194
+ // [101, 102, 103, ..., 173],
195
+ // [102, 103, 104, ..., 174]]
196
+ r0 = ggml_rope_multi(
197
+ ctx0, x, p0, nullptr,
198
+ n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
199
+ // [[-67, -67, -67, ..., -67]
200
+ // [-67, -67, -67, ..., -67]
201
+ // [-67, -67, -67, ..., -67]]
202
+ r1 = ggml_rope_multi(
203
+ ctx0, r0, p1, nullptr,
204
+ n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
205
+
206
+ // [[33, 34, 35, ..., 105]
207
+ // [34, 35, 36, ..., 106]
208
+ // [35, 36, 37, ..., 107]]
209
+ r2 = ggml_rope_multi(
210
+ ctx0, x, p2, nullptr,
211
+ n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
212
+ }
171
213
 
172
214
  ggml_cgraph * gf = ggml_new_graph(ctx0);
173
215