@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/include/llama-cpp.h
@@ -0,0 +1,25 @@
+ #pragma once
+
+ #ifndef __cplusplus
+ #error "This header is for C++ only"
+ #endif
+
+ #include <memory>
+
+ #include "llama.h"
+
+ struct llama_model_deleter {
+     void operator()(llama_model * model) { llama_free_model(model); }
+ };
+
+ struct llama_context_deleter {
+     void operator()(llama_context * context) { llama_free(context); }
+ };
+
+ struct llama_sampler_deleter {
+     void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
+ };
+
+ typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
+ typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
+ typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
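The new llama-cpp.h header only layers RAII aliases on top of the existing C API. A minimal usage sketch follows; the model path is a placeholder, and the llama_load_model_from_file / llama_new_context_with_model / default-params calls are assumed from the rest of llama.h rather than shown in this hunk:

    #include "llama-cpp.h"

    int main() {
        // the unique_ptr aliases invoke llama_free_model / llama_free / llama_sampler_free on scope exit
        llama_model_ptr model(llama_load_model_from_file("model.gguf", llama_model_default_params()));
        if (!model) {
            return 1;
        }
        llama_context_ptr ctx(llama_new_context_with_model(model.get(), llama_context_default_params()));
        llama_sampler_ptr smpl(llama_sampler_init_greedy());
        // ... pass model.get(), ctx.get() and smpl.get() to the rest of the C API ...
        return 0;
    }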
package/src/llama.cpp/include/llama.h
@@ -2,6 +2,7 @@
  #define LLAMA_H

  #include "ggml.h"
+ #include "ggml-cpu.h"
  #include "ggml-backend.h"

  #include <stddef.h>
@@ -103,12 +104,15 @@ extern "C" {
  LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
  LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
  LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
+ LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
  };

  enum llama_rope_type {
- LLAMA_ROPE_TYPE_NONE = -1,
- LLAMA_ROPE_TYPE_NORM = 0,
- LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
+ LLAMA_ROPE_TYPE_NONE = -1,
+ LLAMA_ROPE_TYPE_NORM = 0,
+ LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
+ LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
+ LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
  };

  enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -170,9 +174,9 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+ //LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
+ //LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
+ //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
  LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors

@@ -184,7 +188,8 @@ extern "C" {
  LLAMA_ROPE_SCALING_TYPE_NONE = 0,
  LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
  LLAMA_ROPE_SCALING_TYPE_YARN = 2,
- LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
+ LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3,
+ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
  };

  enum llama_pooling_type {
@@ -205,7 +210,7 @@ extern "C" {
  enum llama_split_mode {
  LLAMA_SPLIT_MODE_NONE = 0, // single GPU
  LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
- LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
+ LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
  };

  // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@@ -217,6 +222,7 @@ extern "C" {

  typedef struct llama_token_data_array {
  // TODO: consider SoA
+ // NOTE: this pointer can be modified by the samplers
  llama_token_data * data;
  size_t size;
  int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -232,8 +238,11 @@ extern "C" {
  // - token  : the token ids of the input (used when embd is NULL)
  // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
  // - pos    : the positions of the respective token in the sequence
+ //            (if set to NULL, the token position will be tracked automatically by llama_decode)
  // - seq_id : the sequence to which the respective token belongs
+ //            (if set to NULL, the sequence ID will be assumed to be 0)
  // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+ //            (if set to NULL, only the logits for last token will be returned)
  //
  typedef struct llama_batch {
  int32_t n_tokens;
@@ -244,15 +253,6 @@ extern "C" {
  int32_t * n_seq_id;
  llama_seq_id ** seq_id;
  int8_t * logits; // TODO: rename this to "output"
-
- // NOTE: helpers for smooth API transition - can be deprecated in the future
- //       for future-proof code, use the above fields instead and ignore everything below
- //
- // pos[i] = all_pos_0 + i*all_pos_1
- //
- llama_pos all_pos_0; // used if pos == NULL
- llama_pos all_pos_1; // used if pos == NULL
- llama_seq_id all_seq_id; // used if seq_id == NULL
  } llama_batch;

  enum llama_model_kv_override_type {
@@ -276,13 +276,13 @@ extern "C" {
  };

  struct llama_model_params {
+ // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
+ ggml_backend_dev_t * devices;
+
  int32_t n_gpu_layers; // number of layers to store in VRAM
  enum llama_split_mode split_mode; // how to split the model across multiple GPUs

- // main_gpu interpretation depends on split_mode:
- // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
- // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
- // LLAMA_SPLIT_MODE_LAYER: ignored
+ // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
  int32_t main_gpu;

  // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@@ -433,6 +433,7 @@ extern "C" {
  LLAMA_API bool llama_supports_mmap       (void);
  LLAMA_API bool llama_supports_mlock      (void);
  LLAMA_API bool llama_supports_gpu_offload(void);
+ LLAMA_API bool llama_supports_rpc        (void);

  LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
  LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
@@ -457,6 +458,7 @@ extern "C" {
  // Functions to access the model's GGUF metadata scalar values
  // - The functions return the length of the string on success, or -1 on failure
  // - The output string is always null-terminated and cleared on failure
+ // - When retrieving a string, an extra byte must be allocated to account for the null terminator
  // - GGUF array values are not supported by these functions

  // Get metadata value as a string by key name
@@ -480,9 +482,6 @@ extern "C" {
  // Returns the total number of parameters in the model
  LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

- // Get a llama model tensor
- LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
-
  // Returns true if the model contains an encoder that requires llama_encode() call
  LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);

@@ -673,6 +672,9 @@ extern "C" {
  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
  LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);

+ // Check if the context supports KV cache shifting
+ LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
+
  //
  // State / sessions
  //
@@ -775,15 +777,15 @@ extern "C" {
  // Decoding
  //

- // Return batch for single sequence of tokens starting at pos_0
+ // Return batch for single sequence of tokens
+ // The sequence ID will be fixed to 0
+ // The position of the tokens will be tracked automatically by llama_decode
  //
  // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
  //
  LLAMA_API struct llama_batch llama_batch_get_one(
        llama_token * tokens,
-       int32_t n_tokens,
-       llama_pos pos_0,
-       llama_seq_id seq_id);
+       int32_t n_tokens);

  // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
  // Each token can be assigned up to n_seq_max sequence ids
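With the reduced llama_batch_get_one signature above, a transition-style decode call no longer passes positions or a sequence id. A hedged fragment (ctx is an existing llama_context, and the token ids are placeholders that would normally come from llama_tokenize):

    std::vector<llama_token> tokens = { 1, 2, 3 }; // placeholder ids, normally produced by llama_tokenize
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
    // positions are now tracked automatically by llama_decode and the sequence id is fixed to 0
    if (llama_decode(ctx, batch) < 0) {
        // error: per the updated comments in this header, the KV cache is restored to its pre-call state
    }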
@@ -803,7 +805,7 @@ extern "C" {
  // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
  // Stores the encoder output internally for later use by the decoder cross-attention layers.
  // 0 - success
- // < 0 - error
+ // < 0 - error. the KV cache state is restored to the state before this call
  LLAMA_API int32_t llama_encode(
        struct llama_context * ctx,
        struct llama_batch batch);
@@ -811,7 +813,7 @@ extern "C" {
  // Positive return values does not mean a fatal error, but rather a warning.
  // 0 - success
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- // < 0 - error
+ // < 0 - error. the KV cache state is restored to the state before this call
  LLAMA_API int32_t llama_decode(
        struct llama_context * ctx,
        struct llama_batch batch);
@@ -896,6 +898,7 @@ extern "C" {
  // Special tokens
  LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
  LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+ LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
  LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
  LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
  LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
@@ -904,11 +907,17 @@ extern "C" {
  LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
  LLAMA_API bool llama_add_eos_token(const struct llama_model * model);

- // Codellama infill tokens
- LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
- LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
- LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
- LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
+ // infill tokens
+ DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
+
+ LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
+ LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
+ LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
+ LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
+ LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
+ LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);

  //
  // Tokenization
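The deprecated llama_token_prefix/middle/suffix accessors map onto the new llama_token_fim_* family above. For illustration only, the usual prefix-suffix-middle (PSM) prompt layout would be assembled roughly as below; the ordering is the common FIM convention rather than anything this header mandates, and append_tokens is a hypothetical helper:

    // build a fill-in-the-middle prompt: <FIM_PRE> prefix <FIM_SUF> suffix <FIM_MID>
    std::vector<llama_token> fim_prompt;
    fim_prompt.push_back(llama_token_fim_pre(model));
    append_tokens(fim_prompt, prefix_tokens); // hypothetical helper appending a token vector
    fim_prompt.push_back(llama_token_fim_suf(model));
    append_tokens(fim_prompt, suffix_tokens);
    fim_prompt.push_back(llama_token_fim_mid(model));
    // generation then continues from here and produces the missing middle span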
@@ -983,6 +992,9 @@ extern "C" {
        char * buf,
        int32_t length);

+ // Get list of built-in chat templates
+ LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
+
  //
  // Sampling API
  //
@@ -1067,12 +1079,13 @@ extern "C" {

  // available samplers:

- LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
- LLAMA_API struct llama_sampler * llama_sampler_init_dist   (uint32_t seed);
+ LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
+ LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);

  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
  /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
- LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
+ DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
+     "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");

  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
  LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1083,16 +1096,18 @@ extern "C" {
  /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
  LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);

- /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
- LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
-
  /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
  LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
+
+ /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
  LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);

  /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
  LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);

+ /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+ LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
+
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1121,22 +1136,50 @@ extern "C" {
        const char * grammar_str,
        const char * grammar_root);

+ /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
  LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
-       int32_t n_vocab, // llama_n_vocab()
-       llama_token special_eos_id, // llama_token_eos()
-       llama_token linefeed_id, // llama_token_nl()
-       int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
-       float penalty_repeat, // 1.0 = disabled
-       float penalty_freq, // 0.0 = disabled
-       float penalty_present, // 0.0 = disabled
-       bool penalize_nl, // consider newlines as a repeatable token
-       bool ignore_eos); // ignore the end-of-sequence token
+       int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
+       float penalty_repeat, // 1.0 = disabled
+       float penalty_freq, // 0.0 = disabled
+       float penalty_present); // 0.0 = disabled
+
+ /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
+ LLAMA_API struct llama_sampler * llama_sampler_init_dry(
+       const struct llama_model * model,
+       float dry_multiplier,
+       float dry_base,
+       int32_t dry_allowed_length,
+       int32_t dry_penalty_last_n,
+       const char ** seq_breakers,
+       size_t num_breakers);

  LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
        int32_t n_vocab,
        int32_t n_logit_bias,
        const llama_logit_bias * logit_bias);

+ // this sampler is meant to be used for fill-in-the-middle infilling
+ // it's supposed to be used after top_k + top_p sampling
+ //
+ // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+ // 2. combine probs of tokens that have the same prefix
+ //
+ // example:
+ //
+ // - before:
+ //   "hel":   0.5
+ //   "hell":  0.2
+ //   "hello": 0.1
+ //   "dummy": 0.1
+ //
+ // - after:
+ //   "hel":   0.8
+ //   "dummy": 0.1
+ //
+ // 3. discard non-EOG tokens with low prob
+ // 4. if no tokens are left -> pick EOT
+ //
+ LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);

  // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
  LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
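Taken together, the sampler changes above (slimmed-down penalties, new DRY and XTC samplers, deprecated softmax, removed tail-free) change how a sampling chain is assembled. A rough sketch using the chain helpers that already exist elsewhere in this header; every numeric parameter below is an illustrative placeholder, not a recommended default:

    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // penalties no longer take n_vocab, special token ids, penalize_nl or ignore_eos
    llama_sampler_chain_add(chain, llama_sampler_init_penalties(64, 1.0f, 0.0f, 0.0f));

    // new samplers introduced in this version range
    llama_sampler_chain_add(chain, llama_sampler_init_dry(model, 0.0f, 1.75f, 2, -1, nullptr, 0));
    llama_sampler_chain_add(chain, llama_sampler_init_xtc(0.0f, 0.1f, 1, LLAMA_DEFAULT_SEED));

    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    llama_token tok = llama_sampler_sample(chain, ctx, -1); // sample from the last set of logits
    llama_sampler_free(chain);                              // frees every sampler in the chain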
@@ -1208,8 +1251,6 @@ extern "C" {
  LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
  LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);

- LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
-
  #ifdef __cplusplus
  }
  #endif
package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp
@@ -0,0 +1,112 @@
+ ied 4 ½ months
+ __ggml_vocab_test__
+ Führer
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+
+ __ggml_vocab_test__
+
+
+
+ __ggml_vocab_test__
+
+
+
+
+ __ggml_vocab_test__
+
+
+ __ggml_vocab_test__
+ Hello world
+ __ggml_vocab_test__
+ Hello world
+ __ggml_vocab_test__
+ Hello World
+ __ggml_vocab_test__
+ Hello World
+ __ggml_vocab_test__
+ Hello World!
+ __ggml_vocab_test__
+ Hello, world!
+ __ggml_vocab_test__
+ Hello, world!
+ __ggml_vocab_test__
+ this is 🦙.cpp
+ __ggml_vocab_test__
+ w048 7tuijk dsdfhu
+ __ggml_vocab_test__
+ нещо на Български
+ __ggml_vocab_test__
+ កាន់តែពិសេសអាចខលចេញ
+ __ggml_vocab_test__
+ 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ Hello
+ __ggml_vocab_test__
+ (
+ __ggml_vocab_test__
+
+ =
+ __ggml_vocab_test__
+ ' era
+ __ggml_vocab_test__
+ Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+ __ggml_vocab_test__
+ !!!!!!
+ __ggml_vocab_test__
+ 3
+ __ggml_vocab_test__
+ 33
+ __ggml_vocab_test__
+ 333
+ __ggml_vocab_test__
+ 3333
+ __ggml_vocab_test__
+ 33333
+ __ggml_vocab_test__
+ 333333
+ __ggml_vocab_test__
+ 3333333
+ __ggml_vocab_test__
+ 33333333
+ __ggml_vocab_test__
+ 333333333
+ __ggml_vocab_test__
+ Cửa Việt
+ __ggml_vocab_test__
+ discards
+ __ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+ 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+ __ggml_vocab_test__
package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out
@@ -0,0 +1,46 @@
+ 2550 204 18430 377
+ 597 2768 298 8564
+
+ 1437
+ 1437 1437
+ 1437 1437 1437
+ 50117
+ 50118
+ 50140
+ 50140 50118
+ 50117 50118
+ 31414 232
+ 20920 232
+ 31414 623
+ 20920 623
+ 20920 623 328
+ 31414 6 232 328
+ 20920 6 232 328
+ 42 16 8103 18164 27 4 49317
+ 605 40976 262 10109 18474 385 29 36807 6455
+ 36765 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328
+ 1376 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 1376 17772 10172 1376 17772 3726 1376 17772 5782 1376 4333 10172 1376 17772 23171
+ 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 36 8338 21554 14 34 63 308 19233 43
+ 31414
+ 20920
+ 1437 20920
+ 1437 1437 20920
+ 1437 1437 1437 20920
+ 1437 1437 1437 20920 50118 1437 1437 1437 20920
+ 36
+ 50118 5457
+ 108 3567
+ 31414 6 1423 108 1250 328 1336 32 47 17841 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772
+ 32376 12846
+ 246
+ 3103
+ 25631
+ 46152
+ 3103 25631
+ 46152 3103
+ 46152 25631
+ 46152 46152
+ 46152 3103 25631
+ 347 1376 2023 12410 102 16376 1376 2023 6382 90
+ 9553 5954
+ 50118 1437 50140 1437 50140 50118 1437 50117 1437 50117 50117 1437 50117 50118 1437 1437 50118 1437 1437 1437 50118 1437 1437 1437 1437 50118 1437 1437 1437 1437 1437 50118 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 8103 18164 27 6569 18164 27 155 2357 30242 155 25631 30242 3103 30242 25631 30242 46152 30242 3103 25631 155 4 246 155 7586 246 155 734 246 25974 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 18636 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 36738 48332 47463 18697 10809 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 128 49690 108 49972 49519 12905 48149 48149 43796 32376 12846 27282 28749 38 348 57 128 41042 37 18 89 6 128 4629 47 686 116 128 448 45 686 38 581 146 24 6 128 495 47 101 103 6845 116 166 108 30660 10 108 462 574
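The two ggml-vocab-roberta-bpe.gguf.* files above form a tokenizer regression fixture: each chunk of the .inp file delimited by __ggml_vocab_test__ is tokenized and must reproduce the corresponding line of token ids in the .out file, so the first pair maps "ied 4 ½ months" to "2550 204 18430 377".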
package/src/llama.cpp/pocs/CMakeLists.txt
@@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

  if (EMSCRIPTEN)
  else()
-     add_subdirectory(vdot)
+     if (NOT GGML_BACKEND_DL)
+         add_subdirectory(vdot)
+     endif()
  endif()
package/src/llama.cpp/pocs/vdot/CMakeLists.txt
@@ -1,9 +1,9 @@
  set(TARGET llama-vdot)
  add_executable(${TARGET} vdot.cpp)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

  set(TARGET llama-q8dot)
  add_executable(${TARGET} q8dot.cpp)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
package/src/llama.cpp/pocs/vdot/q8dot.cpp
@@ -11,6 +11,7 @@
  #include <type_traits>

  #include <ggml.h>
+ #include <ggml-cpu.h>

  constexpr int kVecSize = 1 << 16;

@@ -136,7 +137,7 @@ int main(int argc, char** argv) {

  auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;

- auto funcs = ggml_internal_get_type_traits(ggml_type);
+ const auto * funcs = ggml_get_type_traits_cpu(ggml_type);

  Stat simple, ggml;

@@ -156,8 +157,8 @@ int main(int argc, char** argv) {

  t1 = std::chrono::high_resolution_clock::now();
  float fs;
- if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
- else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
+ if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
+ else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
  t2 = std::chrono::high_resolution_clock::now();
  t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
  if (iloop > 3) ggml.addResult(fs, t);
package/src/llama.cpp/pocs/vdot/vdot.cpp
@@ -9,6 +9,7 @@
  #include <array>

  #include <ggml.h>
+ #include <ggml-cpu.h>

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -236,7 +237,7 @@ int main(int argc, char** argv) {
  int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
  int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);

- auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
+ const auto * funcs_cpu = ggml_get_type_traits_cpu(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);

  std::vector<block_q4_0> q40;
  std::vector<block_q4_1> q41;
@@ -261,9 +262,9 @@ int main(int argc, char** argv) {
  // Note, we do not include this in the timing as in practical application
  // we already have the quantized model weights.
  if (useQ4_1) {
-     funcs.from_float(x1.data(), q41.data(), kVecSize);
+     funcs_cpu->from_float(x1.data(), q41.data(), kVecSize);
  } else {
-     funcs.from_float(x1.data(), q40.data(), kVecSize);
+     funcs_cpu->from_float(x1.data(), q40.data(), kVecSize);
  }

  // Now measure time the dot product needs using the "scalar" version above
@@ -282,10 +283,10 @@ int main(int argc, char** argv) {
      dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
  }
  else {
-     auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
-     vdot.from_float(y1.data(), q8.data(), kVecSize);
-     if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
-     else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
+     const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
+     vdot->from_float(y1.data(), q8.data(), kVecSize);
+     if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
+     else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
  }
  sumq += result;
  t2 = std::chrono::high_resolution_clock::now();
package/src/llama.cpp/src/CMakeLists.txt
@@ -1,9 +1,4 @@
- # TODO: should not use this
- if (WIN32)
-     if (BUILD_SHARED_LIBS)
-         set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-     endif()
- endif()
+ llama_add_compile_flags()

  #
  # libraries
@@ -23,11 +18,12 @@ add_library(llama
  )

  target_include_directories(llama PUBLIC . ../include)
- target_compile_features   (llama PUBLIC cxx_std_11) # don't bump
+ target_compile_features   (llama PUBLIC cxx_std_17) # don't bump

  target_link_libraries(llama PUBLIC ggml)

  if (BUILD_SHARED_LIBS)
      set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
-     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
+     target_compile_definitions(llama PRIVATE LLAMA_BUILD)
+     target_compile_definitions(llama PUBLIC  LLAMA_SHARED)
  endif()