@fugood/llama.node 0.2.2 → 0.3.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp
@@ -0,0 +1,503 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+#include "pca.hpp"
+#include "mean.hpp"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include <cstdio>
+#include <string>
+#include <tuple>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <climits>
+
+
+//////////////////////////////////////////////////
+// utils
+
+template <class Iter>
+static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
+    std::string ret;
+    for (; begin != end; ++begin) {
+        ret += llama_token_to_piece(ctx, *begin);
+    }
+
+    return ret;
+}
+
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    printf("\nexample usage:\n");
+    printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
+    printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
+    printf("\n    advanced:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
+    printf("\n    using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
+    printf("\n");
+}
+
+//////////////////////////////////////////////////
+
+
+// cb_eval is reused for each positive - negative prompt pair
+struct callback_data {
+    ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered
+
+    int n_layers = 0;
+    int n_tokens = 0;
+    bool is_eval_pos = true;
+
+    // each element of the vector corresponds to one layer
+    std::vector<struct ggml_tensor *> v_pos;           // vector of matrices of size [n_embd, n_tokens]
+    std::vector<struct ggml_tensor *> v_neg;           // vector of matrices of size [n_embd, n_tokens]
+    std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows may differ per layer
+
+    // save a tensor into either v_pos or v_neg (decided by is_eval_pos)
+    void save_tensor_for_layer(struct ggml_tensor * t) {
+        GGML_ASSERT(t->type == GGML_TYPE_F32);
+
+        if (ctx_ggml == nullptr) {
+            // alloc a new ctx_ggml if needed
+            struct ggml_init_params params_ggml = {
+                /*.mem_size   =*/ ggml_tensor_overhead() * n_layers * 3u,
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            ctx_ggml = ggml_init(params_ggml);
+        }
+
+        // copy tensor data
+        auto n_bytes = ggml_nbytes(t);
+        struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
+        t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
+        ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
+        ggml_set_name(t_layer, ggml_get_name(t));
+        //print_debug_tensor(t_layer);
+
+        if (is_eval_pos) {
+            v_pos.push_back(t_layer);
+        } else {
+            v_neg.push_back(t_layer);
+        }
+    }
+
+    // calculate diff (v_pos - v_neg) and place the result back into v_pos
+    // all zero rows in the diff tensor will also be removed
+    // NOTE: the final layer is ignored; we only have (n_layers - 1) layers to process
+    std::vector<struct ggml_tensor *> calc_diff() {
+        for (size_t il = 0; il < v_pos.size(); il++) {
+            float * a = (float *) v_pos[il]->data;
+            float * b = (float *) v_neg[il]->data;
+            size_t n_elem = ggml_nelements(v_pos[il]);
+            for (size_t j = 0; j < n_elem; j++) {
+                a[j] -= b[j];
+            }
+            //print_debug_tensor(v_pos[il]);
+            auto diff_filtered = filter_nonzero_rows(v_pos[il]);
+            v_diff_filtered.push_back(diff_filtered);
+        }
+        return v_diff_filtered; // for convenience, return the resulting std::vector
+    }
+
+    // delete zero rows from a given 2D tensor
+    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
+        //printf("filter_nonzero_rows\n");
+        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
+            // check if the given row contains only zero elements
+            int n_cols = t->ne[0]; // hint: should be equal to n_embd
+            for (int col = 0; col < n_cols; ++col) {
+                if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
+                    return false;
+                }
+            }
+            return true;
+        };
+        std::vector<int> rows_to_copy; // the indices of the non-zero rows (to be copied into diff_filtered)
+        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
+            if (!is_row_all_zeros(a, i_row, 1e-6)) {
+                rows_to_copy.push_back(i_row);
+            }
+        }
+
+        // get "n_nonzero_rows" for the output "diff_filtered"
+        int n_nonzero_rows = rows_to_copy.size();
+        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
+        int n_embd = a->ne[0];
+        GGML_ASSERT(n_nonzero_rows > 0);
+
+        // diff_filtered: [n_embd, n_nonzero_rows]
+        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
+            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
+        ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
+        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
+
+        // copy non-zero rows
+        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
+            int src_row = rows_to_copy[dest_row];
+            for (int i = 0; i < n_embd; i++) {
+                float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
+                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
+            }
+        }
+
+        //print_debug_tensor(diff_filtered);
+
+        return diff_filtered;
+    }
+
+    // we don't implement a destructor because we want to reuse callback_data; we just free the tensor data here
+    void reset() {
+        for (auto ptr : v_pos) free(ptr->data);
+        for (auto ptr : v_neg) free(ptr->data);
+        for (auto ptr : v_diff_filtered) free(ptr->data);
+        v_pos.clear();
+        v_neg.clear();
+        v_diff_filtered.clear();
+        if (ctx_ggml) {
+            ggml_free(ctx_ggml);
+        }
+        ctx_ggml = nullptr;
+    }
+};
+
+/**
+ * process_ctx is used to store the ggml context for pre- and post-processing the diff vectors
+ * in short, input => v_diff and output => v_final
+ */
+struct train_context {
+    ggml_context * ctx_ggml;
+    int n_embd;
+    int n_layers;
+
+    /* pairs of prompts to be used for generating the final vector */
+    std::vector<std::string> positive_entries;
+    std::vector<std::string> negative_entries;
+
+    // each element of the vector corresponds to one layer
+    // NOTE: the last layer is discarded; therefore, we will have (n_layers - 1) elements here
+    // NOTE (2): v_diff is transposed from v_diff_tmp
+    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
+    std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
+
+    // to make re-allocation easy when concatenating v_diff, we temporarily store v_diff in a byte vector instead of a tensor
+    // v_diff_tmp will get converted into v_diff later on
+    std::vector<std::vector<uint8_t>> v_diff_tmp;
+
+    train_context(int n_embd_, int n_layers_) {
+        n_embd = n_embd_;
+        n_layers = n_layers_;
+        struct ggml_init_params params_ggml = {
+            /*.mem_size   =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_ggml = ggml_init(params_ggml);
+        for (int il = 0; il < n_layers - 1; il++) {
+            std::vector<uint8_t> empty;
+            v_diff_tmp.push_back(empty);
+            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
+            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
+            v_final.push_back(t);
+        }
+    }
+
+    // add new rows into the existing tensors in v_diff_tmp
+    void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
+        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
+        for (int il = 0; il < n_layers - 1; il++) {
+            auto t = diff_filtered[il];
+            auto & diff_tmp = v_diff_tmp[il];
+            size_t curr_size = diff_tmp.size();
+            diff_tmp.resize(curr_size + ggml_nbytes(t));
+            memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
+        }
+    }
+
+    // build the v_diff tensors from v_diff_tmp (v_diff needs to be transposed)
+    // TODO @ngxson : maybe add an option NOT to transpose v_diff; will be useful for the "mean" method
+    void build_v_diff(bool transpose) {
+        printf("build_v_diff\n");
+        for (int il = 0; il < n_layers - 1; il++) {
+            auto & diff_tmp = v_diff_tmp[il];
+            int n_elem = diff_tmp.size() / sizeof(float);
+            GGML_ASSERT(n_elem % n_embd == 0);
+            int n_rows = n_elem / n_embd;
+            struct ggml_tensor * diff = transpose
+                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
+                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
+            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
+            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
+            if (transpose) {
+                // copy data & transpose
+                float * arr = (float *) diff_tmp.data();
+                for (int ir = 0; ir < n_rows; ++ir) {
+                    for (int ic = 0; ic < n_embd; ++ic) {
+                        float f = arr[ir*n_embd + ic];
+                        ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+                    }
+                }
+            } else {
+                // only copy
+                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
+            }
+            v_diff.push_back(diff);
+            print_debug_tensor(diff);
+            // free memory of diff_tmp
+            diff_tmp.resize(0);
+        }
+    }
+
+    ~train_context() {
+        for (auto ptr : v_final) free(ptr->data);
+        for (auto ptr : v_diff) free(ptr->data);
+        // no need to free v_diff_tmp, since we didn't use malloc
+        ggml_free(ctx_ggml);
+    }
+};
+
+struct tokenized_prompt {
+    std::vector<llama_token> tokens_pos;
+    std::vector<llama_token> tokens_neg;
+    size_t max_seq_len;
+
+    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
+        const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+        tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
+        tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
+        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
+        padding_seq(ctx, tokens_pos, max_seq_len);
+        padding_seq(ctx, tokens_neg, max_seq_len);
+    }
+
+    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
+        // TODO: customize padding token
+        std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
+        llama_token pad_tok = pad_tokens.back();
+        while (tokens.size() < len) {
+            tokens.push_back(pad_tok);
+        }
+    }
+};
+
+//////////////////////////////////////////////////
+
+template <typename T>
+static std::string to_string(const T & val) {
+    std::stringstream ss;
+    ss << val;
+    return ss.str();
+}
+
+static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
+    std::vector<std::string> output;
+    std::ifstream file(path);
+    if (!file.is_open()) {
+        fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
+        exit(1);
+    }
+    std::string line;
+    while (std::getline(file, line)) {
+        bool is_skip = skip_empty_lines && line.empty();
+        if (!is_skip) {
+            string_process_escapes(line);
+            output.push_back(line);
+        }
+    }
+    file.close();
+    return output;
+}
+
+//////////////////////////////////////////////////
+
+static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (callback_data *) user_data;
+    static const char * l_out_name = "l_out";
+    const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
+
+    if (ask) {
+        return is_l_out;
+    }
+
+    if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
+        return true;
+    }
+
+    // save the tensor to the current context
+    cb_data->save_tensor_for_layer(t);
+    return true;
+}
+
+static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
+    llama_kv_cache_clear(ctx);
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+        fprintf(stderr, "%s : failed to eval\n", __func__);
+        return false;
+    }
+    return true;
+}
+
+static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
+    struct gguf_context * ctx = gguf_init_empty();
+
+    const std::string arch = "controlvector";
+    gguf_set_val_str(ctx, "general.architecture", arch.c_str());
+    gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
+    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
+
+    for (size_t i = 0; i < v_ctrl.size(); ++i) {
+        gguf_add_tensor(ctx, v_ctrl[i]);
+        print_debug_tensor(v_ctrl[i]);
+        printf("Added tensor: %s\n", v_ctrl[i]->name);
+    }
+
+    printf("%s: writing file...\n", __func__);
+    gguf_write_to_file(ctx, fname.c_str(), false);
+    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
+    gguf_free(ctx);
+}
+
+/**
+ * Load the prompt files and the completions file.
+ * Then format each pair of prompt + completion to make an entry.
+ */
+static int prepare_entries(gpt_params & params, train_context & ctx_train) {
+    // load prompts
+    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
+    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
+    if (positive_prompts.size() != negative_prompts.size()) {
+        fprintf(stderr, "number of positive and negative prompts must be equal\n");
+        return 1;
+    }
+    if (positive_prompts.empty()) {
+        fprintf(stderr, "must provide at least one prompt pair\n");
+        return 1;
+    }
+    ctx_train.positive_entries = positive_prompts;
+    ctx_train.negative_entries = negative_prompts;
+    return 0;
+}
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
+        return 1;
+    }
+
+    if (params.n_pca_iterations % params.n_pca_batch != 0) {
+        fprintf(stderr, "PCA iterations must be a multiple of the PCA batch size\n");
+        return 1;
+    }
+
+
+    callback_data cb_data;
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    params.cb_eval = cb_eval;
+    params.cb_eval_user_data = &cb_data;
+    params.warmup = false;
+
+    print_build_info();
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // load the model to get hparams
+    llama_model * model;
+    llama_context * ctx;
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+
+    // int n_ctx = llama_n_ctx(ctx);
+    int n_layers = llama_n_layer(model);
+    int n_embd = llama_n_embd(model);
+    // get the model hint param (a.k.a. the model arch name)
+    char model_hint[128];
+    llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
+
+    // init train_context
+    train_context ctx_train(n_embd, n_layers);
+
+    // load and prepare entries for training
+    prepare_entries(params, ctx_train);
+
+    // we have to pretokenize everything because otherwise we don't know how much overhead to allocate for ctx_diffs_wrapped
+    std::vector<tokenized_prompt> tokenized_prompts;
+    size_t n_total_tokens = 0;
+    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
+        tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
+        n_total_tokens += 2 * t.max_seq_len;
+        tokenized_prompts.push_back(std::move(t));
+    }
+
+    std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
+
+    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
+        bool success = false;
+        tokenized_prompt t = tokenized_prompts[i];
+        cb_data.n_layers = n_layers;
+        cb_data.n_tokens = t.max_seq_len;
+
+        printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
+            (int) i+1, (int) ctx_train.positive_entries.size(),
+            tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
+            tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
+            (int) t.max_seq_len);
+
+        cb_data.is_eval_pos = true;
+        success = get_hidden_layers(ctx, t.tokens_pos);
+        if (!success) break;
+
+        cb_data.is_eval_pos = false;
+        success = get_hidden_layers(ctx, t.tokens_neg);
+        if (!success) break;
+
+        // calculate diff and remove all zero rows
+        auto v_diff_filtered = cb_data.calc_diff();
+
+        // save & concat the filtered v_diff to ctx_train
+        ctx_train.concat_diff_tmp(v_diff_filtered);
+
+        // reset for the next iteration
+        cb_data.reset();
+    }
+
+    // done with the model, we can now free it to regain some memory
+    printf("Done evaluating prompts, unloading model...\n");
+    llama_free(ctx);
+    llama_free_model(model);
+
+    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
+
+    // prepare ctx_train for PCA
+    ctx_train.build_v_diff(use_pca);
+
+    if (use_pca) {
+        // run PCA
+        PCA::pca_params pca_params;
+        pca_params.n_threads    = params.n_threads;
+        pca_params.n_batch      = params.n_pca_batch;
+        pca_params.n_iterations = params.n_pca_iterations;
+        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+    } else {
+        // run mean
+        mean::run(ctx_train.v_diff, ctx_train.v_final);
+    }
+
+    // write output vectors to gguf
+    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
+
+    llama_backend_free();
+
+    return 0;
+}
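For orientation, the workflow above is: evaluate each positive/negative prompt pair, collect per-layer hidden-state diffs, reduce them to one direction per layer (PCA or mean), and export the directions as a "controlvector" GGUF. A minimal usage sketch, assuming the renamed llama.cpp binaries and the --control-vector / --control-vector-scaled flags from the same llama.cpp revision, and the default output name control_vector.gguf (all assumptions; verify against your build):

    # generate a control vector from the bundled positive.txt / negative.txt prompt pairs
    ./llama-cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99

    # apply it at inference time, optionally with a scale factor
    ./llama-cli -m ./llama-3.Q4_K_M.gguf --control-vector control_vector.gguf -p "Who are you?"
    ./llama-cli -m ./llama-3.Q4_K_M.gguf --control-vector-scaled control_vector.gguf 0.8 -p "Who are you?"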
package/src/llama.cpp/examples/cvector-generator/mean.hpp
@@ -0,0 +1,48 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <string>
+#include <vector>
+#include <math.h>
+
+namespace mean {
+
+static void run(
+        const std::vector<struct ggml_tensor *> & v_input,  // shape of v_input[0]: [n_embd, n_samples]
+        const std::vector<struct ggml_tensor *> & v_output) {
+    printf("%s: Running mean...\n", __func__);
+    for (size_t il = 0; il < v_input.size(); ++il) {
+        // prepare output vector
+        struct ggml_tensor * ctrl_out = v_output[il];
+        ggml_format_name(ctrl_out, "direction.%zu", il+1);
+
+        // calculate mean vector
+        struct ggml_tensor * t_layer = v_input[il];
+        GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
+        for (int ic = 0; ic < t_layer->ne[0]; ic++) {
+            float f = 0.0;
+            for (int ir = 0; ir < t_layer->ne[1]; ir++) {
+                f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
+            }
+            f /= t_layer->ne[1];
+            ggml_set_f32_1d(ctrl_out, ic, f);
+        }
+
+        // normalize output vector
+        float norm = 0.0;
+        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
+            float f = ggml_get_f32_1d(ctrl_out, i);
+            norm += f*f;
+        }
+        norm = sqrt(norm);
+        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
+            float f = ggml_get_f32_1d(ctrl_out, i);
+            ggml_set_f32_1d(ctrl_out, i, f / norm);
+        }
+
+        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
+    }
+}
+
+}
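In other words, mean::run reduces each layer's diff matrix $D_\ell$ (shape [n_embd, n_samples]) to a single unit-length direction: a column-wise mean over samples followed by L2 normalization, which is exactly the two loops above:

    m_\ell[c] = \frac{1}{n_{\text{samples}}} \sum_{r=1}^{n_{\text{samples}}} D_\ell[c, r], \qquad v_\ell = \frac{m_\ell}{\lVert m_\ell \rVert_2}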
package/src/llama.cpp/examples/cvector-generator/negative.txt
@@ -0,0 +1,4 @@
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
+<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
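Each line above is one negative example; it pairs line-by-line with positive.txt (also added as +4 lines in this diff), and the generator tokenizes each pair, pads both sides to the same length, and subtracts their hidden states. The positive counterpart is not shown in this excerpt; a hypothetical matching line, for illustration only, would use the same Llama-3 chat-template format with the opposite sentiment, e.g.:

    <|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like I'm walking on sunshine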