@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/tests/test-sampling.cpp
@@ -10,6 +10,8 @@
 #include <string>
 #include <vector>
 
+extern struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers);
+
 static void dump(const llama_token_data_array * cur_p) {
     for (size_t i = 0; i < cur_p->size; i++) {
         printf("%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
@@ -18,181 +20,188 @@ static void dump(const llama_token_data_array * cur_p) {
 
 #define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0)
 
-#define APPLY(__cnstr, __cur_p) do { \
-    auto * cnstr = (__cnstr); \
-    llama_sampler_apply(cnstr, (__cur_p)); \
-    llama_sampler_free(cnstr); \
-} while(0)
+struct sampler_tester {
+    sampler_tester(size_t n_vocab) {
+        cur.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+            const float logit = logf(token_id);
+            cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        }
 
-static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
-    const size_t n_vocab = probs.size();
+        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
+    }
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    sampler_tester(const std::vector<float> & probs, const std::vector<float> & probs_expected) : probs_expected(probs_expected) {
+        cur.reserve(probs.size());
+        for (llama_token token_id = 0; token_id < (llama_token)probs.size(); token_id++) {
+            const float logit = logf(probs[token_id]);
+            cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]});
+        }
+
+        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
     }
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_top_k(k), &cur_p);
-    DUMP(&cur_p);
-
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
+    void apply(llama_sampler * sampler) {
+        llama_sampler_apply(sampler, &cur_p);
+        llama_sampler_free(sampler);
     }
-}
 
-static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    const size_t n_vocab = probs.size();
+    void check() {
+        GGML_ASSERT(cur_p.size == probs_expected.size());
+        for (size_t i = 0; i < cur_p.size; i++) {
+            GGML_ASSERT(fabs(cur_p.data[i].p - probs_expected[i]) < 1e-5);
+        }
+    }
+
+    llama_token_data_array cur_p;
+
+  private:
+    const std::vector<float> probs_expected;
 
     std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+};
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_top_p(p, 1), &cur_p);
-    DUMP(&cur_p);
-
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+static void test_temp(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_temp(temp));
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
 }
 
-static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
-    const size_t n_vocab = probs.size();
+static void test_temp_ext(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp, float delta, float exponent) {
+    sampler_tester tester(probs, probs_expected);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_tail_free(z, 1), &cur_p);
-    DUMP(&cur_p);
+    tester.check();
+}
 
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+static void test_top_k(const std::vector<float> & probs, const std::vector<float> & probs_expected, int k) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_top_k(k));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
 }
 
-static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    const size_t n_vocab = probs.size();
+static void test_top_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_top_p(p, 1));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_min_p(p, 1), &cur_p);
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    tester.check();
 }
 
-static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    const size_t n_vocab = probs.size();
+static void test_min_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_min_p(p, 1));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
-    DUMP(&cur_p);
-    APPLY(llama_sampler_init_typical(p, 1), &cur_p);
-    DUMP(&cur_p);
+    tester.check();
+}
 
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+static void test_xtc(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p, float t) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_xtc(p, t, 0, 0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_typical(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_typical(p, 1));
+    DUMP(&tester.cur_p);
+
+    tester.check();
 }
 
 static void test_penalties(
     const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
-    const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
+    const std::vector<float> & probs_expected, float repeat_penalty, float alpha_frequency, float alpha_presence
 ) {
-    GGML_ASSERT(probs.size() == expected_probs.size());
+    GGML_ASSERT(probs.size() == probs_expected.size());
+
+    sampler_tester tester(probs, probs_expected);
 
     const size_t n_vocab = probs.size();
+    auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
 
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(probs[token_id]);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    for (size_t i = 0; i < last_tokens.size(); i++) {
+        llama_sampler_accept(sampler, last_tokens[i]);
     }
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    DUMP(&tester.cur_p);
+    tester.apply(sampler);
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
 
-    auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
+static void test_dry(
+    const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
+    const std::vector<float> & expected_probs, float dry_multiplier, float dry_base,
+    int dry_allowed_length, int dry_penalty_last_n,
+    const std::vector<std::vector<llama_token>> & seq_breakers
+) {
+    GGML_ASSERT(probs.size() == expected_probs.size());
+
+    sampler_tester tester(probs, expected_probs);
+
+    auto * sampler = llama_sampler_init_dry_testing(1024, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers);
 
     for (size_t i = 0; i < last_tokens.size(); i++) {
         llama_sampler_accept(sampler, last_tokens[i]);
     }
 
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
-    APPLY(sampler, &cur_p);
-    APPLY(llama_sampler_init_softmax(), &cur_p);
-    DUMP(&cur_p);
-
-    GGML_ASSERT(cur_p.size == expected_probs.size());
-    for (size_t i = 0; i < cur_p.size; i++) {
-        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
+    DUMP(&tester.cur_p);
+    tester.apply(sampler);
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
+    tester.check();
 }
 
 static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
 ) {
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(token_id);
-        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
-
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    sampler_tester tester(n_vocab);
 
     llama_token min_token_id = 0;
     const llama_token max_token_id = n_vocab-1;
 
     for (auto s : samplers_sequence) {
         switch (s){
-            case 'k': APPLY(llama_sampler_init_top_k(top_k), &cur_p); break;
-            case 'f': GGML_ABORT("tail_free test not implemented");
+            case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break;
             case 'y': GGML_ABORT("typical test not implemented");
-            case 'p': APPLY(llama_sampler_init_top_p(top_p, 1), &cur_p); break;
-            case 'm': APPLY(llama_sampler_init_min_p(min_p, 1), &cur_p); break;
+            case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break;
+            case 'm': tester.apply(llama_sampler_init_min_p(min_p, 1)); break;
            case 't': GGML_ABORT("temperature test not implemented");
            default : GGML_ABORT("Unknown sampler");
        }
 
-        APPLY(llama_sampler_init_softmax(), &cur_p); // make sure tokens are sorted for tests
+        tester.apply(llama_sampler_init_dist(0));
+
+        auto & cur_p = tester.cur_p;
 
         const int size = cur_p.size;
 
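[Note] The hunk above folds the old per-test boilerplate (build a candidate array, APPLY a sampler, assert on the result) into the sampler_tester helper, and the removed llama_sampler_init_softmax() call is replaced by llama_sampler_init_dist(0), which sorts the candidates and normalizes their probabilities. A minimal sketch of the pattern a test now follows (the test name and the sampler chain here are illustrative, not from the diff):

    // Illustrative only: shape of a test written against the new helper.
    // Assumes the sampler_tester struct and DUMP macro defined in the hunk above.
    static void test_my_sampler(const std::vector<float> & probs,
                                const std::vector<float> & probs_expected) {
        sampler_tester tester(probs, probs_expected); // logits = logf(probs[i])

        DUMP(&tester.cur_p);
        tester.apply(llama_sampler_init_top_k(2));    // sampler chain under test
        tester.apply(llama_sampler_init_dist(0));     // sort + renormalize probabilities
        DUMP(&tester.cur_p);

        tester.check(); // asserts |p[i] - probs_expected[i]| < 1e-5
    }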
@@ -263,7 +272,7 @@ static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vec
     }
     const int64_t t_end = ggml_time_us();
     llama_sampler_free(cnstr);
-    printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
+    printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
 }
 
 #define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
@@ -275,30 +284,35 @@ static void test_perf() {
 
     data.reserve(n_vocab);
     for (int i = 0; i < n_vocab; i++) {
-        const float logit = 2.0f*((float)(rand())/RAND_MAX - 0.5f);
+        const float logit = 2.0f*((double)(rand())/RAND_MAX - 0.5);
         data.emplace_back(llama_token_data{i, logit, 0.0f});
     }
 
-    BENCH(llama_sampler_init_top_k    (40),      data, 32);
-    BENCH(llama_sampler_init_top_p    (0.8f, 1), data, 32);
-    BENCH(llama_sampler_init_min_p    (0.2f, 1), data, 32);
-    BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
-    BENCH(llama_sampler_init_typical  (0.5f, 1), data, 32);
-    BENCH(llama_sampler_init_softmax  (),        data, 32);
+    BENCH(llama_sampler_init_top_k  (40),               data, 32);
+    BENCH(llama_sampler_init_top_p  (0.8f, 1),          data, 32);
+    BENCH(llama_sampler_init_min_p  (0.2f, 1),          data, 32);
+    BENCH(llama_sampler_init_typical(0.5f, 1),          data, 32);
+    BENCH(llama_sampler_init_xtc    (1.0f, 0.1f, 1, 1), data, 32);
 }
 
 int main(void) {
     ggml_time_init();
 
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);
+    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
+    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f);
+
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f);
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f);
+
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1);
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
     test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0);
 
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 0);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f}, 0.7f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 0.8f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
 
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);
@@ -309,9 +323,13 @@ int main(void) {
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f);
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f);
 
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
+    printf("XTC should:\n");
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f},             0.99f, 0.09f);
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.1f},       0.99f, 0.19f);
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.3f, 0.2f, 0.1f}, 0.99f, 0.29f);
+
+    printf("XTC should not:\n");
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.39f);
 
     test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
     test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
@@ -324,6 +342,13 @@ int main(void) {
     test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
     test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
 
+
+    test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1}, {0.25f, 0.25f, 0.25f, 0.25f}, 1.0f, 1.1f, 2, 4, {});
+    test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1, 2, 0, 1}, {0.296923f, 0.296923f, 0.296923f, 0.109232f}, 1.0f, 1.1f, 2, 5, {});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 2, 6, {{3}});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 1}, {0.241818f, 0.241818f, 0.241818f, 0.241818f, 0.032727f}, 2.0f, 1.1f, 2, 5, {});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 4, 7, {});
+
     test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
     test_sampler_queue(10000, "k", 1, 1.0f, 1.0f);
     test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f);
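[Note] The expected values in the new test_dry cases can be reproduced by hand if one assumes the usual DRY penalty rule, logit -= dry_multiplier * dry_base^(match_len - dry_allowed_length); that formula is an assumption here, not shown in this diff. For the second case, history {0, 1, 2, 0, 1} means the token extending the length-2 repeat loses 1.0 * 1.1^(2-2) = 1.0 from its logit while the other three logits stay equal:

    // Standalone arithmetic check for the second test_dry case (formula assumed as above).
    #include <cmath>
    #include <cstdio>

    int main() {
        const float penalty = 1.0f * powf(1.1f, 2 - 2); // multiplier * base^(len - allowed) = 1.0
        const float w_kept  = 0.25f;                    // softmax weight of the 3 unpenalized tokens
        const float w_pen   = 0.25f * expf(-penalty);   // penalized token: logit reduced by `penalty`
        const float sum     = 3.0f * w_kept + w_pen;
        printf("%f %f\n", w_kept / sum, w_pen / sum);   // prints ~0.296923 ~0.109232
    }

This matches {0.296923f, 0.296923f, 0.296923f, 0.109232f} above; check() compares against the sorted candidate list, so the penalized token appears last.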
package/src/llama.cpp/tests/test-tokenizer-0.cpp
@@ -202,7 +202,7 @@ int main(int argc, char **argv) {
     for (int i = 0; i < nthread; i++) {
         threads[i] = std::thread([&, i]() {
             for (const auto & test_kv : k_tests) {
-                const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
+                const std::vector<llama_token> res = common_tokenize(ctx, test_kv.first, add_special, false);
 
                 // here only print the result of the first thread
                 // because the other threads are running the same tests
@@ -212,7 +212,7 @@ int main(int argc, char **argv) {
 
                     printf("\n");
                     printf("src: '%s'\n", test_kv.first.c_str());
-                    printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
+                    printf("res: '%s'\n", common_detokenize(ctx, res).c_str());
                     printf("tok: ");
                     for (const auto & tok : res) {
                         printf("%d ", tok);
@@ -229,16 +229,16 @@ int main(int argc, char **argv) {
                 if (!correct) {
                     fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
                     fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                        llama_detokenize(ctx, res).c_str(),
-                        llama_detokenize(ctx, test_kv.second).c_str());
+                        common_detokenize(ctx, res).c_str(),
+                        common_detokenize(ctx, test_kv.second).c_str());
                     fprintf(stderr, "%s : expected tokens: ", __func__);
                     for (const auto & t : test_kv.second) {
-                        fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+                        fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
                     }
                     fprintf(stderr, "\n");
                     fprintf(stderr, "%s : got tokens: ", __func__);
                     for (const auto & t : res) {
-                        fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+                        fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
                     }
                     fprintf(stderr, "\n");
 
@@ -273,7 +273,7 @@ int main(int argc, char **argv) {
         {
             const auto t_start = ggml_time_us();
 
-            res = llama_tokenize(ctx, text, add_special, false);
+            res = common_tokenize(ctx, text, add_special, false);
 
             const auto t_end = ggml_time_us();
 
package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp
@@ -78,10 +78,10 @@ int main(int argc, char **argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize(ctx, std::vector<int>(1, i));
+        std::string str = common_detokenize(ctx, std::vector<int>(1, i));
         try {
             auto cps = unicode_cpts_from_utf8(str);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+            std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
             if (ignore_merges && tokens.size() > 1) {
                 fprintf(stderr,
                     "%s : error: token %d detokenizes to '%s'(%zu) but "
@@ -94,7 +94,7 @@ int main(int argc, char **argv) {
                 fprintf(stderr, "]\n");
                 return 2;
             }
-            std::string check = llama_detokenize(ctx, tokens);
+            std::string check = common_detokenize(ctx, tokens);
             if (check != str) {
                 fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                     __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -123,8 +123,8 @@ int main(int argc, char **argv) {
         }
 
         std::string str = unicode_cpt_to_utf8(cp);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize(ctx, tokens);
+        std::vector<llama_token> tokens = common_tokenize(ctx, str, false);
+        std::string check = common_detokenize(ctx, tokens);
         if (cp != 9601 && str != check) {
             fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                 cp, check.c_str(), check.length(), str.c_str(), str.length());
package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp
@@ -66,9 +66,9 @@ int main(int argc, char ** argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize(ctx, std::vector<int>(1, i), true);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
-        std::string check = llama_detokenize(ctx, tokens);
+        std::string str = common_detokenize(ctx, std::vector<int>(1, i), true);
+        std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
+        std::string check = common_detokenize(ctx, tokens);
         if (check != str) {
             fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                 __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -93,8 +93,8 @@ int main(int argc, char ** argv) {
         }
 
         std::string str = unicode_cpt_to_utf8(cp);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
-        std::string check = llama_detokenize(ctx, tokens);
+        std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
+        std::string check = common_detokenize(ctx, tokens);
         if (cp != 9601 && str != check) {
             fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                 cp, check.c_str(), check.length(), str.c_str(), str.length());
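[Note] The four tokenizer-test hunks above are mechanical renames: the llama_tokenize / llama_detokenize / llama_token_to_piece convenience wrappers moved to the common_ prefix (common_tokenize, common_detokenize, common_token_to_piece) as part of the common.h changes listed as file 40 above. A minimal round-trip sketch against the renamed helpers, assuming a valid llama_context obtained elsewhere:

    // Round-trip sketch using the renamed common/ helpers; `ctx` is assumed
    // to be a llama_context loaded elsewhere (model loading not shown).
    #include "common.h"

    static bool roundtrip_ok(llama_context * ctx, const std::string & text) {
        // add_special=false, parse_special=false, mirroring the tests above
        std::vector<llama_token> tokens = common_tokenize(ctx, text, false, false);
        std::string check = common_detokenize(ctx, tokens);
        return check == text;
    }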
package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml
@@ -1,72 +0,0 @@
-name: Nix aarch64 builds
-
-on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
-    # 1.5h instead of minutes with the cold cache).
-    #
-    # randint(0, 59), randint(0, 23)
-    - cron: '26 12 * * *'
-  # But also rebuild if we touched any of the Nix expressions:
-  push:
-    branches:
-      - master
-    paths: ['**/*.nix', 'flake.lock']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['**/*.nix', 'flake.lock']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
-  id-token: write
-  contents: read
-
-jobs:
-  nix-build-aarch64:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Install QEMU
-        # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y qemu-user-static qemu-system-aarch64
-          sudo usermod -a -G kvm $USER
-      - name: Install Nix
-        uses: DeterminateSystems/nix-installer-action@v9
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          extra-conf: |
-            extra-platforms = aarch64-linux
-            extra-system-features = nixos-test kvm
-            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-      - uses: DeterminateSystems/magic-nix-cache-action@v2
-        with:
-          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-      - name: Set-up cachix to push the results to
-        uses: cachix/cachix-action@v13
-        with:
-          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-          name: llama-cpp
-      - name: Show all output paths
-        run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.aarch64-linux"
-      - name: Build
-        run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --systems aarch64-linux
-          --flake
-          ".#checks.aarch64-linux"