@fugood/llama.node 0.3.2 → 0.3.4

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -34,55 +34,6 @@ struct results_log_softmax {
  float prob;
  };

- static void write_logfile(
- const llama_context * ctx, const gpt_params & params, const llama_model * model,
- const struct results_perplexity & results
- ) {
- if (params.logdir.empty()) {
- return;
- }
-
- if (params.hellaswag) {
- LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
- return;
- }
-
- const std::string timestamp = string_get_sortable_timestamp();
-
- const bool success = fs_create_directory_with_parents(params.logdir);
- if (!success) {
- LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
- __func__, params.logdir.c_str());
- return;
- }
-
- const std::string logfile_path = params.logdir + timestamp + ".yml";
- FILE * logfile = fopen(logfile_path.c_str(), "w");
-
- if (logfile == NULL) {
- LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
- return;
- }
-
- fprintf(logfile, "binary: main\n");
- char model_desc[128];
- llama_model_desc(model, model_desc, sizeof(model_desc));
- yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
-
- fprintf(logfile, "\n");
- fprintf(logfile, "######################\n");
- fprintf(logfile, "# Perplexity Results #\n");
- fprintf(logfile, "######################\n");
- fprintf(logfile, "\n");
-
- yaml_dump_vector_float(logfile, "logits", results.logits);
- fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
- yaml_dump_vector_float(logfile, "probs", results.probs);
-
- llama_perf_dump_yaml(logfile, ctx);
- fclose(logfile);
- }
-
  static std::vector<float> softmax(const std::vector<float>& logits) {
  std::vector<float> probs(logits.size());
  float max_logit = logits[0];
@@ -169,7 +120,7 @@ static void process_logits(
  break;
  }
  lock.unlock();
- const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
+ const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]);
  const double v = -results.log_softmax;
  local_nll += v;
  local_nll2 += v*v;
@@ -203,7 +154,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
  break;
  }
  lock.unlock();
- const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
+ const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
  local_nll += v;
  local_nll2 += v*v;
  }
@@ -281,7 +232,9 @@ static std::pair<double, float> log_softmax(int n_vocab, const float * logits, c
  kld.sum_kld += sum;
  kld.sum_kld2 += sum*sum;
  ++kld.count;
- if (imax == imax_base) ++kld.n_same_top;
+ if (imax == imax_base) {
+ ++kld.n_same_top;
+ }

  const float p_base = expf(-nll_base);
  const float p = expf(-nll);
@@ -323,7 +276,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
  break;
  }
  lock.unlock();
- std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+ std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
  kld_values[i] = (float)v.first;
  p_diff_values[i] = v.second;
  }
@@ -337,7 +290,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
  }
  }

- static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
+ static results_perplexity perplexity_v2(llama_context * ctx, const common_params & params) {
  // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
  // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
  // Output: `perplexity: 13.5106 [114/114]`
@@ -348,7 +301,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

  LOG_INF("%s: tokenizing the input ..\n", __func__);

- std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+ std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);

  const int n_ctx = llama_n_ctx(ctx);

@@ -383,9 +336,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
  const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;

  const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
  const int n_batch = params.n_batch;

+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
  int count = 0;
  double nll = 0.0;

@@ -405,14 +359,21 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
  // clear the KV cache
  llama_kv_cache_clear(ctx);

+ llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
  for (int j = 0; j < num_batches; ++j) {
  const int batch_start = start + j * n_batch;
  const int batch_size = std::min(end - batch_start, n_batch);

+ common_batch_clear(batch);
+ for (int i = 0; i < batch_size; i++) {
+ common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+ }
+
  //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
- // TODO: use llama_batch.logits instead of relying on logits_all == true
- if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+ if (llama_decode(ctx, batch)) {
  //LOG_ERR("%s : failed to eval\n", __func__);
+ llama_batch_free(batch);
  return {tokens, -1, logit_history, prob_history};
  }

@@ -424,14 +385,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
  tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
  }

- const auto batch_logits = llama_get_logits(ctx);
- logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+ const auto * batch_logits = llama_get_logits(ctx);
+ logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);

  if (j == 0) {
  tokens[batch_start] = token_org;
  }
  }

+ llama_batch_free(batch);
+
  const auto t_end = std::chrono::high_resolution_clock::now();

  if (i == 0) {
@@ -447,11 +410,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

  //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
  for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
-
  // Calculate probability of next token, given the previous ones.
  const std::vector<float> tok_logits(
- logits.begin() + (j + 0) * n_vocab,
- logits.begin() + (j + 1) * n_vocab);
+ logits.begin() + size_t(j + 0) * n_vocab,
+ logits.begin() + size_t(j + 1) * n_vocab);

  const float prob = softmax(tok_logits)[tokens[start + j + 1]];
  logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
@@ -472,7 +434,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
  return {tokens, std::exp(nll / count), logit_history, prob_history};
  }

- static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) {
+ static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
  if (params.ppl_stride > 0) {
  return perplexity_v2(ctx, params);
  }
@@ -500,7 +462,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
  auto tim1 = std::chrono::high_resolution_clock::now();
  LOG_INF("%s: tokenizing the input ..\n", __func__);

- std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+ std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);

  auto tim2 = std::chrono::high_resolution_clock::now();
  LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -521,9 +483,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
  const int n_chunk_max = tokens.size() / n_ctx;

  const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
  const int n_batch = params.n_batch;

+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
  int count = 0;
  double nll = 0.0;
  double nll2 = 0.0;
@@ -538,7 +501,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

  std::vector<float> logits;
  if (num_batches > 1) {
- logits.reserve((size_t)n_ctx * n_vocab);
+ logits.reserve(size_t(n_ctx) * n_vocab);
  }

  LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
@@ -620,7 +583,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

  if (num_batches > 1 && n_outputs > 0) {
  const auto * batch_logits = llama_get_logits(ctx);
- logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
+ logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab);
  }
  }

@@ -661,7 +624,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
  } else {
  double av = nll/count;
  double av2 = nll2/count - av*av;
- if (av2 > 0) av2 = sqrt(av2/(count-1));
+ if (av2 > 0) {
+ av2 = sqrt(av2/(count-1));
+ }
  LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
  }
  }
@@ -686,10 +651,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
  return {tokens, ppl, logit_history, prob_history};
  }

- static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
+ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
  int prev_outputs = 0;
- for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
- const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+ for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
+ const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);

  llama_batch batch_view = {
  n_tokens,
@@ -699,7 +664,6 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
  batch.n_seq_id + i,
  batch.seq_id + i,
  batch.logits + i,
- 0, 0, 0, // unused
  };

  const int ret = llama_decode(ctx, batch_view);
@@ -713,7 +677,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
  n_outputs += batch_view.logits[i] != 0;
  }

- memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float));
+ memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));

  prev_outputs += n_outputs;
  }
@@ -728,7 +692,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
  if (eval_results.size() != eval_pairs.size()) {
  eval_results.resize(eval_pairs.size());
  }
- if (eval_pairs.empty()) return;
+ if (eval_pairs.empty()) {
+ return;
+ }

  size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());

@@ -736,11 +702,13 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
  auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
  float local_logprobs[K_TOKEN_CHUNK];
  while (true) {
- size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
- if (first >= eval_results.size()) break;
- size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
+ const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
+ if (first >= eval_results.size()) {
+ break;
+ }
+ const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
  for (size_t i = first; i < last; ++i) {
- auto logits = batch_logits + eval_pairs[i].first * n_vocab;
+ const auto * logits = batch_logits + eval_pairs[i].first * n_vocab;
  float max_logit = logits[0];
  for (int j = 1; j < n_vocab; ++j) {
  max_logit = std::max(max_logit, logits[j]);
@@ -763,7 +731,7 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
  }
  }

- static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
+ static void hellaswag_score(llama_context * ctx, const common_params & params) {
  // Calculates hellaswag score (acc_norm) from prompt
  //
  // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@@ -844,7 +812,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
  for (size_t j = 0; j < 4; j++) {
  hs_cur.ending[j] = prompt_lines[idx*6+2+j];
- hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
+ hs_cur.seq_tokens[j] = common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
  }

  // determine the common prefix of the endings
@@ -877,10 +845,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

  double acc = 0.0f;

- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
  const int n_ctx = llama_n_ctx(ctx);
  const int n_batch = params.n_batch;

+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
  const int max_tasks_per_batch = 32;
  const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

@@ -888,7 +857,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

  std::vector<float> tok_logits(n_vocab);
  // TODO: this could be made smaller; it's currently the worst-case size
- std::vector<float> batch_logits(n_vocab*n_ctx);
+ std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);

  std::vector<std::pair<size_t, llama_token>> eval_pairs;
  std::vector<float> eval_results;
@@ -900,7 +869,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  size_t i1 = i0;
  size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch

- llama_batch_clear(batch);
+ common_batch_clear(batch);

  // batch as much tasks as possible into the available context
  // each task has 4 unique sequence ids - one for each ending
@@ -916,7 +885,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  }

  for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
- llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
+ common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
  }
  batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
  n_logits += 1;
@@ -926,7 +895,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  // TODO: don't evaluate the last token of each sequence
  for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
  const bool needs_logits = i < seq_tokens_size - 1;
- llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+ common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
  n_logits += needs_logits;
  }
  }
@@ -975,7 +944,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  auto & hs_cur = hs_data[i];

  // get the logits of the last token of the common prefix
- std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float));
+ std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float));

  const auto first_probs = softmax(tok_logits);

@@ -1102,7 +1071,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
  * 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2
  *
  */
- static void winogrande_score(llama_context * ctx, const gpt_params & params) {
+ static void winogrande_score(llama_context * ctx, const common_params & params) {

  constexpr int k_min_trailing_ctx = 3;

@@ -1136,8 +1105,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  LOG_INF("%s : tokenizing selected tasks\n", __func__);

  for (auto & task : data) {
- task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
- task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
+ task.seq_tokens[0] = common_tokenize(ctx, task.first + task.choices[0] + task.second, true);
+ task.seq_tokens[1] = common_tokenize(ctx, task.first + task.choices[1] + task.second, true);

  task.common_prefix = 0;
  for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@@ -1152,16 +1121,17 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  task.seq_tokens[0].size() - task.common_prefix +
  task.seq_tokens[1].size() - task.common_prefix;

- task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
- task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
+ task.n_base1 = common_tokenize(ctx, task.first + task.choices[0], true).size();
+ task.n_base2 = common_tokenize(ctx, task.first + task.choices[1], true).size();
  }

  LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);

- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
  const int n_ctx = llama_n_ctx(ctx);
  const int n_batch = params.n_batch;

+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
  const int max_tasks_per_batch = 128;
  const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

@@ -1169,7 +1139,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

  std::vector<float> tok_logits(n_vocab);
  // TODO: this could be made smaller; it's currently the worst-case size
- std::vector<float> batch_logits(n_vocab*n_ctx);
+ std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);

  std::vector<std::pair<size_t, llama_token>> eval_pairs;
  std::vector<float> eval_results;
@@ -1184,7 +1154,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  size_t i1 = i0;
  size_t i_logits = 0;

- llama_batch_clear(batch);
+ common_batch_clear(batch);

  while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
  int n_logits = 0;
@@ -1194,7 +1164,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  }

  for (size_t i = 0; i < data[i1].common_prefix; ++i) {
- llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
+ common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
  }
  batch.logits[batch.n_tokens - 1] = true;
  n_logits += 1;
@@ -1202,7 +1172,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  for (int s = 0; s < 2; ++s) {
  // TODO: end before the last token, no need to predict past the end of the sequences
  for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
- llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
+ common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
  n_logits += 1;
  }
  }
@@ -1359,7 +1329,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
  }
  return false;
  }
- task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
+ task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true));
  }
  auto min_len = task.seq_tokens.front().size();
  for (auto& seq : task.seq_tokens) {
@@ -1403,7 +1373,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
  // git@hf.co:datasets/Stevross/mmlu
  // https://huggingface.co/datasets/truthful_qa
  //
- static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
+ static void multiple_choice_score(llama_context * ctx, const common_params & params) {

  std::istringstream strstream(params.prompt);
  uint32_t n_task;
@@ -1509,17 +1479,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

  LOG("\ntask\tacc_norm\n");

- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
  const int n_ctx = llama_n_ctx(ctx);
  const int n_batch = params.n_batch;

+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
  const int max_tasks_per_batch = 32;
  const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

  llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

  std::vector<float> tok_logits(n_vocab);
- std::vector<float> batch_logits(n_vocab*n_ctx);
+ std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);

  std::vector<std::pair<size_t, llama_token>> eval_pairs;
  std::vector<float> eval_results;
@@ -1536,7 +1507,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  size_t i1 = i0;
  size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch

- llama_batch_clear(batch);
+ common_batch_clear(batch);

  // batch as much tasks as possible into the available context
  // each task has 4 unique sequence ids - one for each ending
@@ -1559,7 +1530,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

  for (size_t i = 0; i < cur_task.common_prefix; ++i) {
  //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
- llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
+ common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
  }
  batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
  n_logits += 1;
@@ -1569,7 +1540,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  // TODO: don't evaluate the last token of each sequence
  for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
  const bool needs_logits = i < seq_tokens_size - 1;
- llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+ common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
  n_logits += needs_logits;
  }
  }
@@ -1627,7 +1598,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  //LOG("\n common_prefix: %zu\n", cur_task.common_prefix);

  // get the logits of the last token of the common prefix
- std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
+ std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float));

  const auto first_probs = softmax(tok_logits);

@@ -1683,7 +1654,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  LOG_INF("\n");
  }

- static void kl_divergence(llama_context * ctx, const gpt_params & params) {
+ static void kl_divergence(llama_context * ctx, const common_params & params) {
  if (params.logits_file.empty()) {
  LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
  return;
@@ -1709,7 +1680,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
  }

- int n_vocab, n_chunk;
+ int n_vocab;
+ int n_chunk;
  in.read((char *)&n_vocab, sizeof(n_vocab));
  in.read((char *)&n_chunk, sizeof(n_chunk));
  if (in.fail()) {
@@ -1720,7 +1692,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
  }

- std::vector<llama_token> tokens(n_ctx * n_chunk);
+ std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
  if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
  LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
  return;
@@ -1737,7 +1709,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
  std::vector<float> logits;
  if (num_batches > 1) {
- logits.reserve(n_ctx * n_vocab);
+ logits.reserve(size_t(n_ctx) * n_vocab);
  }

  std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@@ -1778,6 +1750,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  // clear the KV cache
  llama_kv_cache_clear(ctx);

+ llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
  for (int j = 0; j < num_batches; ++j) {
  const int batch_start = start + j * n_batch;
  const int batch_size = std::min(end - batch_start, n_batch);
@@ -1790,9 +1764,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
  }

- // TODO: use llama_batch.logits instead of relying on logits_all == true
- if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+ common_batch_clear(batch);
+ for (int i = 0; i < batch_size; i++) {
+ common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+ }
+
+ if (llama_decode(ctx, batch)) {
  LOG_ERR("%s : failed to eval\n", __func__);
+ llama_batch_free(batch);
  return;
  }

@@ -1801,10 +1780,12 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

  if (num_batches > 1) {
  const auto * batch_logits = llama_get_logits(ctx);
- logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+ logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
  }
  }

+ llama_batch_free(batch);
+
  const auto t_end = std::chrono::high_resolution_clock::now();

  if (i == 0) {
@@ -1822,7 +1803,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

  const int first = n_ctx/2;
  const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
- process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+ process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
  workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
  p_diff_ptr += n_ctx - 1 - first;
  kld_ptr += n_ctx - 1 - first;
@@ -1955,17 +1936,17 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  }

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

  params.n_ctx = 512;
  params.logits_all = true;
  params.escape = false;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
  return 1;
  }

- gpt_init();
+ common_init();

  const int32_t n_ctx = params.n_ctx;

@@ -2004,7 +1985,7 @@ int main(int argc, char ** argv) {
  llama_numa_init(params.numa);

  // load the model and apply lora adapter, if any
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);

  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;
@@ -2023,7 +2004,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  LOG_INF("\n");
- LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("%s\n", common_params_get_system_info(params).c_str());
  }

  struct results_perplexity results;
@@ -2042,8 +2023,6 @@ int main(int argc, char ** argv) {
  LOG("\n");
  llama_perf_context_print(ctx);

- write_logfile(ctx, params, model, results);
-
  llama_free(ctx);
  llama_free_model(model);

@@ -3,4 +3,4 @@ add_executable(${TARGET} quantize.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
  target_include_directories(${TARGET} PRIVATE ../../common)
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -48,9 +48,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
  { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
  { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
  { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
- { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
- { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
- { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
  { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
  { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
  { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
@@ -3,4 +3,4 @@ add_executable(${TARGET} quantize-stats.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
  target_include_directories(${TARGET} PRIVATE ../../common)
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
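
Note on the perplexity.cpp hunks above: the recurring change is the upstream llama.cpp move away from llama_batch_get_one() plus the context-wide logits_all flag, onto an explicitly built llama_batch with per-token logit requests. The sketch below is illustrative only; the helper name decode_tokens is hypothetical, and the calls used (llama_batch_init, common_batch_clear, common_batch_add, llama_decode, llama_get_logits, llama_batch_free) are the ones that appear in the diff.

    #include "common.h"
    #include "llama.h"

    #include <algorithm>
    #include <vector>

    // Decode `tokens` in chunks of `n_batch`, requesting logits for every token
    // explicitly instead of relying on logits_all.
    static bool decode_tokens(llama_context * ctx, const std::vector<llama_token> & tokens, int n_batch) {
        // room for n_batch tokens per decode call, no embeddings, one sequence
        llama_batch batch = llama_batch_init(n_batch, 0, 1);

        for (size_t start = 0; start < tokens.size(); start += n_batch) {
            const int batch_size = (int) std::min<size_t>(n_batch, tokens.size() - start);

            common_batch_clear(batch);
            for (int i = 0; i < batch_size; i++) {
                // absolute position, sequence id 0, logits requested for this token
                common_batch_add(batch, tokens[start + i], (llama_pos) (start + i), { 0 }, true);
            }

            if (llama_decode(ctx, batch)) {
                llama_batch_free(batch);
                return false;
            }

            // one row of logits per token that requested them
            const float * logits = llama_get_logits(ctx);
            (void) logits; // consume as needed
        }

        llama_batch_free(batch);
        return true;
    }

The per-token boolean passed to common_batch_add is what replaces the old logits_all switch: logits are produced only for the positions that ask for them, which in the perplexity and KL-divergence paths happens to be every position in the batch.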