@fugood/llama.node 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. package/CMakeLists.txt +6 -3
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +3 -3
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  24. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  25. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  26. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  27. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  28. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  29. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  31. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  32. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  36. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  37. package/src/llama.cpp/CMakeLists.txt +91 -1245
  38. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  39. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  40. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  41. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  42. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  43. package/src/llama.cpp/common/common.cpp +1116 -877
  44. package/src/llama.cpp/common/common.h +191 -77
  45. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  46. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  47. package/src/llama.cpp/common/log.h +1 -1
  48. package/src/llama.cpp/common/ngram-cache.h +10 -3
  49. package/src/llama.cpp/common/sampling.cpp +19 -10
  50. package/src/llama.cpp/docs/build.md +353 -0
  51. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  52. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  54. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  56. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  58. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  60. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  61. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  62. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  63. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  64. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  65. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  66. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  67. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  68. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  69. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  71. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  72. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  73. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  75. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  76. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  77. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  79. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  80. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  87. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  88. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  90. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  91. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  92. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  94. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  95. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  96. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  97. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  98. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  99. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  100. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  102. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  103. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  104. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  105. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  106. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  107. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  108. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  110. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  111. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  112. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  113. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  114. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  115. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  116. package/src/llama.cpp/examples/main/main.cpp +98 -75
  117. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  118. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  119. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  120. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  121. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  122. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  123. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  124. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  125. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  126. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  127. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  128. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  129. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  130. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  131. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  132. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  133. package/src/llama.cpp/examples/server/server.cpp +274 -671
  134. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  135. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  136. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  137. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  138. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  139. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  140. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  141. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  142. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  143. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  144. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  145. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  146. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  147. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  148. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  149. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  150. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  151. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  152. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  153. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  154. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  155. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  156. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  157. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  159. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  160. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  161. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  162. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  163. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  178. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  179. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  180. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  181. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  182. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  183. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  184. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  208. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  209. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  210. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  211. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  212. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  214. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  215. package/src/llama.cpp/models/.editorconfig +1 -0
  216. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  217. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  221. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  230. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  233. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  237. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  249. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  252. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  255. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  258. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  259. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  260. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  261. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  263. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  264. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  265. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  266. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  267. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  268. package/src/llama.cpp/requirements.txt +5 -4
  269. package/src/llama.cpp/scripts/build-info.sh +30 -0
  270. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  271. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  272. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  273. package/src/llama.cpp/src/llama-grammar.h +39 -0
  274. package/src/llama.cpp/src/llama-impl.h +26 -0
  275. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  276. package/src/llama.cpp/src/llama-sampling.h +56 -0
  277. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  278. package/src/llama.cpp/src/llama-vocab.h +130 -0
  279. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  280. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  281. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  282. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  283. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  284. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  285. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  286. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  287. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  289. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  290. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  291. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  292. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  293. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  294. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  295. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  296. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  297. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  298. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  299. package/bin/darwin/arm64/default.metallib +0 -0
  300. package/bin/darwin/x64/default.metallib +0 -0
  301. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  302. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  303. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  304. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  305. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  306. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  307. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  308. package/src/llama.cpp/ggml-opencl.h +0 -36
  309. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  310. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  311. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  314. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  315. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  316. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  317. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  318. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  319. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/cvector-generator/pca.hpp
@@ -0,0 +1,325 @@
+ #include "common.h"
+ #include "llama.h"
+ #include "ggml.h"
+
+ #ifdef GGML_USE_CUDA
+ #include "ggml-cuda.h"
+ #endif
+
+ #ifdef GGML_USE_METAL
+ #include "ggml-metal.h"
+ #endif
+
+ #include <cstdio>
+ #include <ctime>
+ #include <string>
+ #include <tuple>
+ #include <vector>
+ #include <algorithm>
+ #include <iostream>
+ #include <fstream>
+
+ #define DEBUG_POS 5
+
+ static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
+     printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
+     if (!with_data) return;
+     printf("%s: %s[0] = [", __func__, t->name);
+     for (size_t i = 0; i <= DEBUG_POS; i++) {
+         printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
+     }
+     printf(" ... ]\n");
+ }
+
+ namespace PCA {
+
+ // input params for PCA computations
+ struct pca_params {
+     int n_threads = 1;
+     int n_batch = 20; // number of iterations do to in one batch. larger the batch, more memory is used
+     int n_iterations = 1000;
+     float tolerance = 1e-7;
+
+     // for debugging
+     int i_layer = 0;
+     int n_layers = 0;
+ };
+
+ // result from each iteration
+ struct pca_result {
+     struct ggml_tensor * calculated_square = NULL;
+     std::vector<struct ggml_tensor *> eigenvectors;
+     std::vector<float> distances;
+ };
+
+ struct pca_model {
+     ggml_backend_t backend = NULL;
+     ggml_backend_buffer_t buffer;
+     struct ggml_context * ctx;      // context to compute graph on target device
+     struct ggml_context * ctx_host; // host context to store results
+
+     // tensors on target device
+     struct ggml_tensor * dev_input;
+     struct ggml_tensor * dev_square;
+     struct ggml_tensor * dev_eigenvector;
+
+     pca_model(struct ggml_tensor * t_input) {
+ #ifdef GGML_USE_CUDA
+         fprintf(stderr, "%s: using CUDA backend\n", __func__);
+         backend = ggml_backend_cuda_init(0); // init device 0
+         if (!backend) {
+             fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+         }
+ #endif
+
+         // TODO: enable Metal support when support for GGML_OP_SQRT is added
+         // #ifdef GGML_USE_METAL
+         //     fprintf(stderr, "%s: using Metal backend\n", __func__);
+         //     backend = ggml_backend_metal_init();
+         //     if (!backend) {
+         //         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+         //     }
+         // #endif
+
+         // if there aren't GPU Backends fallback to CPU backend
+         if (!backend) {
+             backend = ggml_backend_cpu_init();
+         }
+
+         const int num_tensors = 4;
+         struct ggml_init_params params {
+             /*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
+             /*.mem_buffer =*/ NULL,
+             /*.no_alloc =*/ true,
+         };
+         ctx = ggml_init(params);
+
+         auto n_samples = t_input->ne[0];
+         auto n_embd = t_input->ne[1];
+
+         dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
+         dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+         dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+         ggml_set_name(dev_input, "dev_input");
+         ggml_set_name(dev_square, "dev_square");
+         ggml_set_name(dev_eigenvector, "dev_eigenvector");
+         buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+         ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
+
+         // initialize eigenvector to random normalized vector
+         {
+             std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
+             std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
+             std::uniform_real_distribution<float> distribution(0.0, 1.0);
+             float sum_sqr = 0.0; // for normalizing random_vec
+             for (size_t i = 0; i < random_vec.size(); ++i) {
+                 float f = distribution(generator);
+                 sum_sqr += f * f;
+                 random_vec[i] = f;
+             }
+             // normalize it
+             float random_vec_norm = std::sqrt(sum_sqr);
+             for (size_t i = 0; i < random_vec.size(); ++i) {
+                 random_vec[i] /= random_vec_norm;
+             }
+             ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
+         }
+     }
+
+     ~pca_model() {
+         ggml_free(ctx);
+         ggml_backend_buffer_free(buffer);
+         ggml_backend_free(backend);
+     }
+ };
+
+ static struct ggml_cgraph * build_graph_piter(
+         const struct pca_params & params,
+         const pca_model & model,
+         bool calc_square = false) {
+     GGML_ASSERT(params.n_batch > 0);
+     // TODO: buf_size must be able to scale with params.n_batch
+     static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+     static std::vector<uint8_t> buf(buf_size);
+
+     struct ggml_init_params params0 = {
+         /*.mem_size =*/ buf_size,
+         /*.mem_buffer =*/ buf.data(),
+         /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+     };
+     // create a temporally context to build the graph
+     struct ggml_context * ctx0 = ggml_init(params0);
+     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+     // turn v_diff_original into square matrix if needed
+     struct ggml_tensor * tmp_square;
+     if (calc_square) {
+         tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
+         ggml_set_name(tmp_square, "tmp_square");
+     }
+
+     struct ggml_tensor * b_tensor;
+     struct ggml_tensor * distance;
+     struct ggml_tensor * old_eigen = model.dev_eigenvector;
+     struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
+
+     for (int i = 0; i < params.n_batch; ++i) {
+         // b_tensor = square * eigenvector^T
+         b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
+         ggml_set_name(b_tensor, "b_tensor");
+
+         // normalize
+         b_tensor = ggml_div_inplace(ctx0,
+             b_tensor,
+             ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
+         );
+         ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
+
+         // calculate distance(new eigenvector - old eigenvector)
+         // we don't use ggml_sub because it may not be implemented on GPU backend
+         struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
+         distance = ggml_sqrt_inplace(ctx0,
+             ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
+         ggml_format_name(distance, "distance_%d", i);
+
+         old_eigen = b_tensor;
+
+         // build operations nodes
+         ggml_build_forward_expand(gf, distance);
+     }
+
+     // delete the temporally context used to build the graph
+     ggml_free(ctx0);
+     return gf;
+ }
+
+ static ggml_status compute_piter(
+         const struct pca_params & params,
+         const pca_model & model,
+         struct ggml_cgraph * gf,
+         ggml_gallocr_t allocr,
+         struct pca_result & result) {
+     // allocate tensors
+     ggml_gallocr_alloc_graph(allocr, gf);
+
+     if (ggml_backend_is_cpu(model.backend)) {
+         ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
+     }
+
+     // TODO: enable GPU support when support for GGML_OP_SQRT is added
+     //#ifdef GGML_USE_METAL
+     //    if (ggml_backend_is_metal(model.backend)) {
+     //        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
+     //    }
+     //#endif
+
+     ggml_status res = ggml_backend_graph_compute(model.backend, gf);
+     if (res == GGML_STATUS_SUCCESS) {
+         auto extract_i = [](std::string prefix, std::string str) -> int {
+             int i = -1;
+             if (str.rfind(prefix, 0) == 0) {
+                 sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
+             }
+             return i;
+         };
+         result.calculated_square = NULL;
+         result.eigenvectors.clear();
+         result.distances.clear();
+         result.eigenvectors.resize(params.n_batch);
+         result.distances.resize(params.n_batch);
+         // get output nodes
+         for (int i = 0; i < gf->n_nodes; ++i) {
+             auto node = gf->nodes[i];
+             int iter = -1;
+             // find b_tensor (without copying data from device)
+             if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
+                 result.eigenvectors[iter] = node;
+             }
+             // find distances, then copy data from device
+             if ((iter = extract_i("distance_", node->name)) > -1) {
+                 float d;
+                 ggml_backend_tensor_get(node, &d, 0, sizeof(float));
+                 result.distances[iter] = d;
+                 // std::cout << node->name << " = " << d << "\n";
+             }
+             // find tmp_square if it exists (without copying data from device)
+             if (std::string(node->name) == "tmp_square") {
+                 result.calculated_square = node;
+             }
+         }
+     }
+     return res;
+ }
+
+ static void power_iteration(
+         const struct pca_params & params,
+         struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
+         struct ggml_tensor * output) {
+     //printf("in power iteration\n");
+     struct pca_model model(input);
+
+     ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+     struct pca_result result;
+     struct ggml_tensor * last_eigenvector = NULL;
+
+     int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations
+     for (int iter = 0; iter < n_iters; ++iter) {
+         bool calc_square = (iter == 0); // only need to calculate square for first iteration
+         struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
+         // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
+         compute_piter(params, model, gf, allocr, result);
+
+         for (size_t k = 0; k < result.distances.size(); ++k) {
+             last_eigenvector = result.eigenvectors[k];
+             if (result.distances[k] < params.tolerance) {
+                 break; // done
+             }
+         }
+
+         if (calc_square) {
+             // copy and store the square matrix if needed
+             GGML_ASSERT(result.calculated_square != NULL);
+             ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
+         }
+
+         {
+             // copy last eigen vector and store as input for next iteration
+             GGML_ASSERT(last_eigenvector != NULL);
+             ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
+         }
+
+         printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
+             __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
+     }
+
+     // get output tensor
+     GGML_ASSERT(last_eigenvector);
+     ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
+     //print_debug_tensor(output);
+     ggml_gallocr_free(allocr);
+
+     // TODO @ngxson : The output vector is randomly inverted
+     // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
+ }
+
+ static void run_pca(
+         struct pca_params & params,
+         const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
+         const std::vector<struct ggml_tensor *> & v_output) {
+     printf("%s: Running PCA...\n", __func__);
+     for (size_t il = 0; il < v_input.size(); ++il) {
+
+         // prepare output vector
+         struct ggml_tensor * ctrl_out = v_output[il];
+         ggml_format_name(ctrl_out, "direction.%ld", il+1);
+
+         // run power_iteration
+         params.i_layer = il;
+         params.n_layers = v_input.size();
+         power_iteration(params, v_input[il], ctrl_out);
+         printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
+     }
+ }
+
+ }
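The pca.hpp file added above computes each control-vector direction with power iteration, expressed as ggml graphs so the same code runs on the CPU or CUDA backend: build a square matrix from the input once, repeatedly multiply it against the current eigenvector estimate, normalize, and stop once consecutive estimates stop moving. A minimal plain-C++ sketch of that loop, independent of ggml (the helper name and the row-major matrix layout are illustrative assumptions, not part of the package):

#include <cmath>
#include <random>
#include <vector>

// Reference power iteration on a precomputed n x n matrix A (row-major),
// mirroring what build_graph_piter/compute_piter evaluate on the backend:
// b = A * v, normalize b, repeat until |b - v| falls below the tolerance.
static std::vector<float> power_iteration_ref(const std::vector<float> & A, int n,
                                              int n_iterations = 1000, float tolerance = 1e-7f) {
    std::mt19937 rng(1234);
    std::uniform_real_distribution<float> dist(0.0f, 1.0f);

    // random normalized starting vector
    std::vector<float> v(n), b(n);
    float norm = 0.0f;
    for (int i = 0; i < n; i++) { v[i] = dist(rng); norm += v[i] * v[i]; }
    norm = std::sqrt(norm);
    for (int i = 0; i < n; i++) { v[i] /= norm; }

    for (int iter = 0; iter < n_iterations; iter++) {
        // b = A * v
        for (int i = 0; i < n; i++) {
            float acc = 0.0f;
            for (int j = 0; j < n; j++) { acc += A[i * n + j] * v[j]; }
            b[i] = acc;
        }
        // normalize b
        norm = 0.0f;
        for (int i = 0; i < n; i++) { norm += b[i] * b[i]; }
        norm = std::sqrt(norm);
        for (int i = 0; i < n; i++) { b[i] /= norm; }
        // distance between the old and new estimate
        float dist2 = 0.0f;
        for (int i = 0; i < n; i++) { dist2 += (b[i] - v[i]) * (b[i] - v[i]); }
        v = b;
        if (std::sqrt(dist2) < tolerance) break;
    }
    return v; // dominant eigenvector, i.e. the first principal component
}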
package/src/llama.cpp/examples/cvector-generator/positive.txt
@@ -0,0 +1,4 @@
+ <|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
+ <|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
+ <|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
+ <|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!
package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp
@@ -0,0 +1,35 @@
+ // Warns users that this filename was deprecated, and provides a link for more information.
+
+ #include <cstdio>
+ #include <string>
+ #include <unordered_map>
+
+ // Main
+ int main(int argc, char** argv) {
+     std::string filename = "main";
+     if (argc >= 1) {
+         filename = argv[0];
+     }
+
+     // Get only the program name from the full path
+     auto pos = filename.find_last_of('/');
+     if (pos != std::string::npos) {
+         filename = filename.substr(pos+1);
+     }
+
+     // Append "llama-" to the beginning of filename to get the replacemnt filename
+     auto replacement_filename = "llama-" + filename;
+
+     // The exception is if the filename is "main", then our replacement filename is "llama-cli"
+     if (filename == "main") {
+         replacement_filename = "llama-cli";
+     }
+
+     fprintf(stdout, "\n");
+     fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+     fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
+     fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
+     fprintf(stdout, "\n");
+
+     return EXIT_FAILURE;
+ }
package/src/llama.cpp/examples/embedding/CMakeLists.txt
@@ -1,4 +1,4 @@
- set(TARGET embedding)
+ set(TARGET llama-embedding)
  add_executable(${TARGET} embedding.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
package/src/llama.cpp/examples/embedding/embedding.cpp
@@ -7,23 +7,30 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

- static std::vector<std::string> split_lines(const std::string & s) {
-     std::string line;
+ static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
      std::vector<std::string> lines;
-     std::stringstream ss(s);
-     while (std::getline(ss, line)) {
-         lines.push_back(line);
+     size_t start = 0;
+     size_t end = s.find(separator);
+
+     while (end != std::string::npos) {
+         lines.push_back(s.substr(start, end - start));
+         start = end + separator.length();
+         end = s.find(separator, start);
      }
+
+     lines.push_back(s.substr(start)); // Add the last part
+
      return lines;
  }

- static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-     for (size_t i = 0; i < tokens.size(); i++) {
-         llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+     size_t n_tokens = tokens.size();
+     for (size_t i = 0; i < n_tokens; i++) {
+         llama_batch_add(batch, tokens[i], i, { seq_id }, true);
      }
  }

- static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+ static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
      // clear previous kv_cache values (irrelevant for embeddings)
      llama_kv_cache_clear(ctx);

@@ -40,22 +47,10 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu

          // try to get sequence embeddings - supported only when pooling_type is not NONE
          const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-         if (embd == NULL) {
-             embd = llama_get_embeddings_ith(ctx, i);
-             if (embd == NULL) {
-                 fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                 continue;
-             }
-         }
+         GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");

          float * out = output + batch.seq_id[i][0] * n_embd;
-         //TODO: I would also add a parameter here to enable normalization or not.
-         /*fprintf(stdout, "unnormalized_embedding:");
-         for (int hh = 0; hh < n_embd; hh++) {
-             fprintf(stdout, "%9.6f ", embd[hh]);
-         }
-         fprintf(stdout, "\n");*/
-         llama_embd_normalize(embd, out, n_embd);
+         llama_embd_normalize(embd, out, n_embd, embd_norm);
      }
  }

@@ -63,6 +58,7 @@ int main(int argc, char ** argv) {
      gpt_params params;

      if (!gpt_params_parse(argc, argv, params)) {
+         gpt_params_print_usage(argc, argv, params);
          return 1;
      }

@@ -79,9 +75,6 @@
      fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

      std::mt19937 rng(params.seed);
-     if (params.random_prompt) {
-         params.prompt = string_random_prompt(rng);
-     }

      llama_backend_init();
      llama_numa_init(params.numa);
@@ -99,6 +92,12 @@
      const int n_ctx_train = llama_n_ctx_train(model);
      const int n_ctx = llama_n_ctx(ctx);

+     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+     if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+         fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+         return 1;
+     }
+
      if (n_ctx > n_ctx_train) {
          fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                  __func__, n_ctx_train, n_ctx);
@@ -111,7 +110,7 @@
      }

      // split the prompt into lines
-     std::vector<std::string> prompts = split_lines(params.prompt);
+     std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);

      // max batch size
      const uint64_t n_batch = params.n_batch;
@@ -171,7 +170,7 @@
          // encode if at capacity
          if (batch.n_tokens + n_toks > n_batch) {
              float * out = emb + p * n_embd;
-             batch_decode(ctx, batch, out, s, n_embd);
+             batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
              llama_batch_clear(batch);
              p += s;
              s = 0;
@@ -184,29 +183,78 @@

      // final batch
      float * out = emb + p * n_embd;
-     batch_decode(ctx, batch, out, s, n_embd);
-
-     // print the first part of the embeddings or for a single prompt, the full embedding
-     fprintf(stdout, "\n");
-     for (int j = 0; j < n_prompts; j++) {
-         fprintf(stdout, "embedding %d: ", j);
-         for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
-             fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
-         }
-         fprintf(stdout, "\n");
-     }
+     batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

-     // print cosine similarity matrix
-     if (n_prompts > 1) {
+     if (params.embd_out.empty()) {
+         // print the first part of the embeddings or for a single prompt, the full embedding
          fprintf(stdout, "\n");
-         printf("cosine similarity matrix:\n\n");
-         for (int i = 0; i < n_prompts; i++) {
-             for (int j = 0; j < n_prompts; j++) {
-                 float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                 fprintf(stdout, "%6.2f ", sim);
+         for (int j = 0; j < n_prompts; j++) {
+             fprintf(stdout, "embedding %d: ", j);
+             for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                 if (params.embd_normalize == 0) {
+                     fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                 } else {
+                     fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                 }
              }
              fprintf(stdout, "\n");
          }
+
+         // print cosine similarity matrix
+         if (n_prompts > 1) {
+             fprintf(stdout, "\n");
+             printf("cosine similarity matrix:\n\n");
+             for (int i = 0; i < n_prompts; i++) {
+                 fprintf(stdout, "%6.6s ", prompts[i].c_str());
+             }
+             fprintf(stdout, "\n");
+             for (int i = 0; i < n_prompts; i++) {
+                 for (int j = 0; j < n_prompts; j++) {
+                     float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                     fprintf(stdout, "%6.2f ", sim);
+                 }
+                 fprintf(stdout, "%1.10s", prompts[i].c_str());
+                 fprintf(stdout, "\n");
+             }
+         }
+     }
+
+     if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
+         const bool notArray = params.embd_out != "array";
+
+         fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+         for (int j = 0;;) { // at least one iteration (one prompt)
+             if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+             fprintf(stdout, "[");
+             for (int i = 0;;) { // at least one iteration (n_embd > 0)
+                 fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                 i++;
+                 if (i < n_embd) fprintf(stdout, ","); else break;
+             }
+             fprintf(stdout, notArray ? "]\n }" : "]");
+             j++;
+             if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+         }
+         fprintf(stdout, notArray ? "\n ]" : "]\n");
+
+         if (params.embd_out == "json+" && n_prompts > 1) {
+             fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
+             for (int i = 0;;) { // at least two iteration (n_prompts > 1)
+                 fprintf(stdout, " [");
+                 for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+                     float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                     fprintf(stdout, "%6.2f", sim);
+                     j++;
+                     if (j < n_prompts) fprintf(stdout, ", "); else break;
+                 }
+                 fprintf(stdout, " ]");
+                 i++;
+                 if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+             }
+             fprintf(stdout, "\n ]");
+         }
+
+         if (notArray) fprintf(stdout, "\n}\n");
      }

      // clean up
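The reworked embedding example above now prints a cosine similarity matrix between prompts (via llama_embd_similarity_cos) and gains selectable normalization (params.embd_normalize) plus JSON/array output modes (params.embd_out). For reference, the value being printed is ordinary cosine similarity over two embedding vectors; a small standalone sketch follows (an illustrative helper, not the library function itself):

#include <cmath>
#include <vector>

// Cosine similarity between two embedding vectors:
// dot(a, b) / (|a| * |b|), returning 0 when either vector is all zeros.
static float cosine_similarity(const std::vector<float> & a, const std::vector<float> & b) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    const size_t n = a.size() < b.size() ? a.size() : b.size();
    for (size_t i = 0; i < n; i++) {
        dot += (double) a[i] * b[i];
        na  += (double) a[i] * a[i];
        nb  += (double) b[i] * b[i];
    }
    if (na == 0.0 || nb == 0.0) {
        return 0.0f;
    }
    return (float) (dot / (std::sqrt(na) * std::sqrt(nb)));
}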
package/src/llama.cpp/examples/eval-callback/CMakeLists.txt
@@ -1,9 +1,9 @@
- set(TARGET eval-callback)
+ set(TARGET llama-eval-callback)
  add_executable(${TARGET} eval-callback.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
  target_compile_features(${TARGET} PRIVATE cxx_std_11)

  set(TEST_TARGET test-eval-callback)
- add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+ add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
  set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp
@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
              } else if (type == GGML_TYPE_I8) {
                  v = (float) *(int8_t *) &data[i];
              } else {
-                 GGML_ASSERT(false);
+                 GGML_ABORT("fatal error");
              }
              printf("%12.4f", v);
              sum += v;
@@ -99,7 +99,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {

      char src1_str[128] = {0};
      if (src1) {
-         sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
+         snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
      }

      printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
@@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) {
  }

  int main(int argc, char ** argv) {
-
      callback_data cb_data;

      gpt_params params;
+
      if (!gpt_params_parse(argc, argv, params)) {
+         gpt_params_print_usage(argc, argv, params);
          return 1;
      }

      print_build_info();

      std::mt19937 rng(params.seed);
-     if (params.random_prompt) {
-         params.prompt = string_random_prompt(rng);
-     }

package/src/llama.cpp/examples/export-lora/CMakeLists.txt
@@ -1,4 +1,4 @@
- set(TARGET export-lora)
+ set(TARGET llama-export-lora)
  add_executable(${TARGET} export-lora.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})