@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/export-lora/export-lora.cpp
@@ -1,462 +1,420 @@
-
  #include "common.h"
  #include "ggml.h"
  #include "ggml-alloc.h"

+ #include <map>
  #include <vector>
  #include <string>
  #include <thread>
+ #include <fstream>

- struct lora_info {
- std::string filename;
- float scale;
- };
-
- struct export_lora_params {
- std::string fn_model_base;
- std::string fn_model_out;
- std::vector<struct lora_info> lora;
- int n_threads;
- };
+ static bool g_verbose = false;

- struct lora_data {
- struct lora_info info;
- std::vector<uint8_t> data;
- struct ggml_context * ctx;
+ static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
+ int id = gguf_find_key(ctx_gguf, key.c_str());
+ return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
+ }

- uint32_t lora_r;
- uint32_t lora_alpha;
- };
+ static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
+ int id = gguf_find_key(ctx_gguf, key.c_str());
+ return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
+ }

- struct llama_file {
- // use FILE * so we don't have to re-open the file to mmap
- FILE * fp;
- size_t size;
+ static void zeros(std::ofstream & file, size_t n) {
+ char zero = 0;
+ for (size_t i = 0; i < n; ++i) {
+ file.write(&zero, 1);
+ }
+ }

- llama_file(const char * fname, const char * mode) {
- fp = std::fopen(fname, mode);
- if (fp == NULL) {
- size = 0;
- } else {
- seek(0, SEEK_END);
- size = tell();
- seek(0, SEEK_SET);
+ static std::string ggml_ne_string(const ggml_tensor * t) {
+ std::string str;
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+ str += std::to_string(t->ne[i]);
+ if (i + 1 < GGML_MAX_DIMS) {
+ str += ", ";
  }
  }
+ return str;
+ }

- size_t tell() const {
- #ifdef _WIN32
- __int64 ret = _ftelli64(fp);
- #else
- long ret = std::ftell(fp);
- #endif
- GGML_ASSERT(ret != -1); // this really shouldn't fail
- return (size_t) ret;
+ static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
+ struct gguf_init_params params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ ctx_ggml,
+ };
+ struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
+ if (!ctx_gguf) {
+ throw std::runtime_error("failed to load input GGUF from " + fname);
  }
+ return ctx_gguf;
+ }

- void seek(size_t offset, int whence) {
- #ifdef _WIN32
- int ret = _fseeki64(fp, (__int64) offset, whence);
- #else
- int ret = std::fseek(fp, (long) offset, whence);
- #endif
- GGML_ASSERT(ret == 0); // same
+ static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+ std::string result;
+ for (size_t pos = 0; ; pos += search.length()) {
+ auto new_pos = s.find(search, pos);
+ if (new_pos == std::string::npos) {
+ result += s.substr(pos, s.size() - pos);
+ break;
+ }
+ result += s.substr(pos, new_pos - pos) + replace;
+ pos = new_pos;
  }
+ s = std::move(result);
+ }

- void read_raw(void * ptr, size_t size) {
- if (size == 0) {
- return;
- }
- errno = 0;
- std::size_t ret = std::fread(ptr, size, 1, fp);
- if (ferror(fp)) {
- die_fmt("read error: %s", strerror(errno));
- }
- if (ret != 1) {
- die("unexpectedly reached end of file");
+ struct file_input {
+ struct ggml_context * ctx_meta = nullptr;
+ struct gguf_context * ctx_gguf = nullptr;
+ std::ifstream f_in;
+ std::map<std::string, ggml_tensor *> tensors;
+ float alpha;
+ float scale;
+
+ file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
+ if (!f_in.is_open()) {
+ throw std::runtime_error("failed to open input gguf from " + fname);
  }
- }

- std::uint32_t read_u32() {
- std::uint32_t ret;
- read_raw(&ret, sizeof(ret));
- return ret;
+ ctx_gguf = load_gguf(fname, &ctx_meta);
+ alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
+ printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
+
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
+ std::string name(cur->name);
+ tensors[name] = cur;
+ if (g_verbose) {
+ printf("%s: %s\n", __func__, cur->name);
+ }
+ }
  }

- std::string read_string(std::uint32_t len) {
- std::vector<char> chars(len);
- read_raw(chars.data(), len);
- return std::string(chars.data(), len);
+ ggml_tensor * get_tensor(std::string name) {
+ if (tensors.find(name) == tensors.end()) {
+ return nullptr;
+ }
+ return tensors[name];
  }

- void write_raw(const void * ptr, size_t size) {
- if (size == 0) {
- return;
+ void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
+ if (tensors.find(name) == tensors.end()) {
+ throw std::runtime_error("cannot find tensor with name: " + name);
  }
- errno = 0;
- size_t ret = std::fwrite(ptr, size, 1, fp);
- if (ret != 1) {
- die_fmt("write error: %s", strerror(errno));
+ auto len = ggml_nbytes(tensors[name]);
+ if (buf.size() < len) {
+ buf.resize(len);
  }
+ auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
+ auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
+ f_in.seekg(offset);
+ f_in.read((char* )buf.data(), len);
  }

- void write_u32(std::uint32_t val) {
- write_raw(&val, sizeof(val));
+ ~file_input() {
+ gguf_free(ctx_gguf);
+ ggml_free(ctx_meta);
  }
+ };

- bool eof() {
- return tell() >= size;
- }
+ struct lora_merge_ctx {
+ // input base model + adapters
+ file_input base_model;
+ std::vector<std::unique_ptr<file_input>> adapters;

- ~llama_file() {
- if (fp) {
- std::fclose(fp);
+ // for computing merged tensor
+ int n_threads;
+ ggml_backend_t backend = nullptr;
+ ggml_gallocr_t allocr = nullptr;
+ std::vector<uint8_t> read_buf;
+
+ // output file
+ struct gguf_context * ctx_out;
+ struct ggml_context * ctx_out_ggml;
+ std::ofstream fout;
+
+ lora_merge_ctx(
+ std::string & base_fname,
+ std::vector<std::tuple<std::string, float>> & lora_files,
+ std::string & outfile,
+ int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+
+ if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
+ throw std::runtime_error("split model is not yet supported");
  }
- }
- };

- static struct export_lora_params get_default_export_lora_params() {
- struct export_lora_params result;
- result.fn_model_base = "";
- result.fn_model_out = "";
- result.n_threads = GGML_DEFAULT_N_THREADS;
- return result;
- }
+ for (auto lora_inp : lora_files) {
+ auto fname = std::get<0>(lora_inp);
+ auto scale = std::get<1>(lora_inp);
+ std::unique_ptr<file_input> adapter(new file_input(fname, scale));
+ check_metadata_lora(adapter.get());
+ adapters.push_back(std::move(adapter));
+ }

- static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
- fprintf(stderr, "usage: %s [options]\n", argv[0]);
- fprintf(stderr, "\n");
- fprintf(stderr, "options:\n");
- fprintf(stderr, " -h, --help show this help message and exit\n");
- fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
- fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str());
- fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n");
- fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n");
- fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params->n_threads);
- }
+ ctx_out = gguf_init_empty();
+ struct ggml_init_params params = {
+ /*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ ctx_out_ggml = ggml_init(params);
+ backend = ggml_backend_cpu_init();
+ allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+ }

- static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
- bool invalid_param = false;
- std::string arg;
- struct export_lora_params default_params = get_default_export_lora_params();
- const std::string arg_prefix = "--";
+ void check_metadata_lora(file_input * adapter) {
+ auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
+ if (general_type != "adapter") {
+ throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+ }

- for (int i = 1; i < argc; i++) {
- arg = argv[i];
- if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
- std::replace(arg.begin(), arg.end(), '_', '-');
+ auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
+ if (adapter_type != "lora") {
+ throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
  }

- if (arg == "-m" || arg == "--model-base") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params->fn_model_base = argv[i];
- } else if (arg == "-o" || arg == "--model-out") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params->fn_model_out = argv[i];
- } else if (arg == "-l" || arg == "--lora") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- struct lora_info lora;
- lora.filename = argv[i];
- lora.scale = 1.0f;
- params->lora.push_back(lora);
- } else if (arg == "-s" || arg == "--lora-scaled") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- struct lora_info lora;
- lora.filename = argv[i];
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- lora.scale = std::stof(argv[i]);
- params->lora.push_back(lora);
- } else if (arg == "-t" || arg == "--threads") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params->n_threads = std::stoi(argv[i]);
- if (params->n_threads <= 0) {
- params->n_threads = std::thread::hardware_concurrency();
- }
- } else {
- fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
- export_lora_print_usage(argc, argv, &default_params);
- exit(1);
+ auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
+ auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture");
+ if (general_arch_base != general_arch_lora) {
+ throw std::runtime_error("model arch and LoRA arch mismatch");
  }
  }

- if (params->fn_model_base == default_params.fn_model_base) {
- fprintf(stderr, "error: please specify a filename for model-base.\n");
- export_lora_print_usage(argc, argv, &default_params);
- exit(1);
- }
- if (params->fn_model_out == default_params.fn_model_out) {
- fprintf(stderr, "error: please specify a filename for model-out.\n");
- export_lora_print_usage(argc, argv, &default_params);
- exit(1);
- }
- if (invalid_param) {
- fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
- export_lora_print_usage(argc, argv, &default_params);
- exit(1);
+ ggml_type get_out_tensor_type(struct ggml_tensor * t) {
+ if (t->type == GGML_TYPE_F32) {
+ return GGML_TYPE_F32;
+ } else {
+ return GGML_TYPE_F16;
+ }
  }
- return true;
- }

- static void free_lora(struct lora_data * lora) {
- if (lora->ctx != NULL) {
- ggml_free(lora->ctx);
- }
- delete lora;
- }
+ void run_merge() {
+ // prepare metadata
+ gguf_set_kv(ctx_out, base_model.ctx_gguf);
+ // output is forced to f16 for now
+ gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
+
+ // check if all lora adapters have the same tensors
+ // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
+ static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
+ if (adapters.size() > 1) {
+ for (size_t i = 1; i < adapters.size(); ++i) {
+ if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
+ throw std::runtime_error(err_no_subset_adapter);
+ }
+ for (auto & it : adapters[i]->tensors) {
+ if (adapters[0]->get_tensor(it.first) == nullptr) {
+ throw std::runtime_error(err_no_subset_adapter);
+ }
+ }
+ }
+ }

- static struct lora_data * load_lora(struct lora_info * info) {
- struct lora_data * result = new struct lora_data;
- result->info = *info;
- result->ctx = NULL;
- result->lora_r = 1;
- result->lora_alpha = 1;
-
- struct llama_file file(info->filename.c_str(), "rb");
- if (file.fp == NULL) {
- fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
- info->filename.c_str());
- free_lora(result);
- return NULL;
- }
+ // mapping base tensor to out tensor (same shape with base, but different type)
+ // if out_tensor == nullptr, we only copy it
+ std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
+ for (auto & it : base_model.tensors) {
+ bool t_a = true;
+ bool t_b = true;
+ for (auto & adapter : adapters) {
+ t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
+ t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
+ }
+ auto base_tensor = it.second;
+ if (!t_a && !t_b) {
+ // only copy
+ struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
+ ggml_set_name(cpy_tensor, base_tensor->name);
+ base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+ gguf_add_tensor(ctx_out, cpy_tensor);
+ } else if (t_a && t_b) {
+ // need merging
+ struct ggml_tensor * out_tensor = ggml_new_tensor(
+ ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
+ ggml_set_name(out_tensor, base_tensor->name);
+ base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+ gguf_add_tensor(ctx_out, out_tensor);
+ } else {
+ throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
+ }
+ }

- struct ggml_init_params params_ggml;
- params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
- params_ggml.mem_buffer = NULL;
- params_ggml.no_alloc = true;
- result->ctx = ggml_init(params_ggml);
+ // placeholder for the meta data
+ {
+ size_t meta_size = gguf_get_meta_size(ctx_out);
+ zeros(fout, meta_size);
+ }

- uint32_t magic = file.read_u32();
- if (magic != LLAMA_FILE_MAGIC_GGLA) {
- die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
- }
- uint32_t version = file.read_u32();
- if (version != 1) {
- die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
- }
- result->lora_r = file.read_u32();
- result->lora_alpha = file.read_u32();
- // read tensor infos from file
- std::vector<char> name_buf;
- std::vector<struct ggml_tensor *> tensors;
- std::vector<size_t> tensors_offset;
- size_t total_nbytes_pad = 0;
- while(!file.eof()) {
- int64_t ne[4] = {1,1,1,1};
- uint32_t n_dims = file.read_u32();
- uint32_t namelen = file.read_u32();
- uint32_t type = file.read_u32();
- for (uint32_t k = 0; k < n_dims; ++k) {
- ne[k] = (int64_t)file.read_u32();
+ // process base model tensors
+ size_t n_merged = 0;
+ for (auto & it : base_to_out_tensors) {
+ if (it.second != nullptr) {
+ merge_tensor(it.first, it.second);
+ n_merged++;
+ } else {
+ copy_tensor(it.first);
+ }
  }
- name_buf.clear();
- name_buf.resize(namelen + 1, '\0');
- file.read_raw(name_buf.data(), namelen);
- file.seek((0-file.tell()) & 31, SEEK_CUR);
- size_t offset = file.tell();
- struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
- ggml_set_name(tensor, name_buf.data());
- size_t nbytes = ggml_nbytes(tensor);
- size_t nbytes_pad = ggml_nbytes_pad(tensor);
- total_nbytes_pad += nbytes_pad;
- tensors.push_back(tensor);
- tensors_offset.push_back(offset);
- file.seek(nbytes, SEEK_CUR);
- }
- // read tensor data
- result->data.resize(total_nbytes_pad);
- size_t data_offset = 0;
- for (size_t i = 0; i < tensors.size(); ++i) {
- struct ggml_tensor * tensor = tensors[i];
- size_t offset = tensors_offset[i];
- size_t nbytes = ggml_nbytes(tensor);
- size_t nbytes_pad = ggml_nbytes_pad(tensor);
- file.seek(offset, SEEK_SET);
- tensor->data = result->data.data() + data_offset;
- file.read_raw(tensor->data, nbytes);
- data_offset += nbytes_pad;
- }
- return result;
- }

+ // write output metadata
+ {
+ std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+ gguf_get_meta_data(ctx_out, data.data());
+ fout.seekp(0);
+ fout.write((const char *)data.data(), data.size());
+ }

- static struct ggml_cgraph * build_graph_lora(
- struct ggml_context * ctx,
- struct ggml_tensor * tensor,
- struct ggml_tensor * lora_a,
- struct ggml_tensor * lora_b,
- float scaling
- ) {
- struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
- if (scaling != 1.0f) {
- ab = ggml_scale(ctx, ab, scaling);
+ printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
+ printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
  }
- struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
-
- struct ggml_cgraph * gf = ggml_new_graph(ctx);
- ggml_build_forward_expand (gf, res);
- return gf;
- }

- static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
- if (lora->ctx == NULL) {
- return false;
- }
- std::string name = ggml_get_name(tensor);
- std::string name_a = name + std::string(".loraA");
- std::string name_b = name + std::string(".loraB");
- struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
- struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
- if (lora_a == NULL || lora_b == NULL) {
- return false;
+ void copy_tensor(struct ggml_tensor * base) {
+ printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
+ size_t len = ggml_nbytes(base);
+ base_model.read_tensor_data(base->name, read_buf);
+ fout.write((char* )read_buf.data(), len);
+ zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
  }

- float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
-
- struct ggml_init_params params;
- params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
- params.mem_buffer = NULL;
- params.no_alloc = true;
- struct ggml_context * ctx = NULL;
- struct ggml_gallocr * alloc = NULL;
- struct ggml_cgraph * gf = NULL;
-
- ctx = ggml_init(params);
- alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
- gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
-
- ggml_gallocr_alloc_graph(alloc, gf);
-
- struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
- static std::vector<uint8_t> data_work;
- data_work.resize(cplan.work_size);
- cplan.work_data = data_work.data();
+ void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
+ std::string name_base(base->name);
+ std::string name_lora_a = name_base + ".lora_a";
+ std::string name_lora_b = name_base + ".lora_b";
+
+ printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
+
+ // context for input tensor
+ std::vector<struct ggml_tensor *> inp_a(adapters.size());
+ std::vector<struct ggml_tensor *> inp_b(adapters.size());
+ struct ggml_init_params params {
+ /*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ struct ggml_context * ctx = ggml_init(params);
+
+ // alloc tensors
+ struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
+ for (size_t i = 0; i < adapters.size(); ++i) {
+ auto t_a = adapters[i]->get_tensor(name_lora_a);
+ auto t_b = adapters[i]->get_tensor(name_lora_b);
+ inp_a[i] = ggml_dup_tensor(ctx, t_a);
+ inp_b[i] = ggml_dup_tensor(ctx, t_b);
+ }
+ ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+
+ // load base tensor to backend buffer
+ base_model.read_tensor_data(name_base, read_buf);
+ if (base->type != GGML_TYPE_F32) {
+ // optionally dequantize it
+ printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
+ auto nels = ggml_nelements(inp_base);
+ ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+ std::vector<uint8_t> dequant_buf(nels * sizeof(float));
+ qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+ ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
+ } else {
+ ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
+ }

- ggml_graph_compute(gf, &cplan);
+ // load lora tensors to backend buffer
+ for (size_t i = 0; i < adapters.size(); ++i) {
+ adapters[i]->read_tensor_data(name_lora_a, read_buf);
+ ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
+ adapters[i]->read_tensor_data(name_lora_b, read_buf);
+ ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
+ }

- ggml_gallocr_free(alloc);
- ggml_free(ctx);
- return true;
- }
+ // build graph
+ struct ggml_cgraph * gf;
+ {
+ static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+ static std::vector<uint8_t> buf(buf_size);
+ struct ggml_init_params params0 = {
+ /*.mem_size =*/ buf_size,
+ /*.mem_buffer =*/ buf.data(),
+ /*.no_alloc =*/ true,
+ };
+ struct ggml_context * ctx0 = ggml_init(params0);
+ gf = ggml_new_graph(ctx0);
+ struct ggml_tensor * cur = inp_base;
+ for (size_t i = 0; i < adapters.size(); ++i) {
+ struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
+ struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
+ // scale
+ const float alpha = adapters[i]->alpha;
+ const float rank = (float) inp_b[i]->ne[0];
+ const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
+ delta = ggml_scale(ctx0, delta, scale);
+ cur = ggml_add(ctx0, delta, cur);
+ printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
+ printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
+ }
+ cur = ggml_cast(ctx0, cur, out->type);
+ printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
+ ggml_build_forward_expand(gf, cur);
+ ggml_free(ctx0);
+ }

- static void export_lora(struct export_lora_params * params) {
- // load all loras
- std::vector<struct lora_data *> loras;
- for (size_t i = 0; i < params->lora.size(); ++i) {
- struct lora_data * lora = load_lora(&params->lora[i]);
- if (lora != NULL) {
- loras.push_back(lora);
+ // compute
+ {
+ ggml_gallocr_alloc_graph(allocr, gf);
+ ggml_backend_cpu_set_n_threads(backend, n_threads);
+ ggml_backend_graph_compute(backend, gf);
  }
- }
- if (loras.size() == 0) {
- fprintf(stderr, "warning: no lora adapters will be applied.\n");
- }

- // open input file
- struct llama_file fin(params->fn_model_base.c_str(), "rb");
- if (!fin.fp) {
- die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
- }
+ // write data to output file
+ {
+ auto result = gf->nodes[gf->n_nodes - 1];
+ size_t len = ggml_nbytes(result);
+ if (read_buf.size() < len) {
+ read_buf.resize(len);
+ }
+ ggml_backend_tensor_get(result, read_buf.data(), 0, len);
+ fout.write((char* )read_buf.data(), len);
+ zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
+ }

- // open base model gguf, read tensors without their data
- struct ggml_context * ctx_in;
- struct gguf_init_params params_gguf;
- params_gguf.no_alloc = true;
- params_gguf.ctx = &ctx_in;
- struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
-
- // create new gguf
- struct gguf_context * gguf_out = gguf_init_empty();
-
- // copy meta data from base model: kv and tensors
- gguf_set_kv(gguf_out, gguf_in);
- int n_tensors = gguf_get_n_tensors(gguf_in);
- for (int i=0; i < n_tensors; ++i) {
- const char * name = gguf_get_tensor_name(gguf_in, i);
- struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
- gguf_add_tensor(gguf_out, tensor);
+ ggml_free(ctx);
+ ggml_backend_buffer_free(buffer);
  }

- // create output file
- struct llama_file fout(params->fn_model_out.c_str(), "wb");
- if (!fout.fp) {
- die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
+ ~lora_merge_ctx() {
+ ggml_gallocr_free(allocr);
+ ggml_backend_free(backend);
+ gguf_free(ctx_out);
+ ggml_free(ctx_out_ggml);
  }
+ };

- // write gguf meta data
- std::vector<uint8_t> meta;
- meta.resize(gguf_get_meta_size(gguf_out));
- gguf_get_meta_data(gguf_out, meta.data());
- fout.write_raw(meta.data(), meta.size());
-
- std::vector<uint8_t> data;
- std::vector<uint8_t> padding;
- for (int i=0; i < n_tensors; ++i) {
- const char * name = gguf_get_tensor_name(gguf_in, i);
- struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-
- // read tensor data
- data.resize(ggml_nbytes(tensor));
- tensor->data = data.data();
- size_t offset = gguf_get_tensor_offset(gguf_in, i);
- fin.seek(offset + meta.size(), SEEK_SET);
- fin.read_raw(data.data(), data.size());
-
- // apply all loras
- for (size_t k = 0; k < loras.size(); ++k) {
- apply_lora(tensor, loras[k], params->n_threads);
- }
-
- // write tensor data + padding
- padding.clear();
- padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
-
- GGML_ASSERT(fout.tell() == offset + meta.size());
- // fout.seek(offset + meta.size(), SEEK_SET);
- fout.write_raw(data.data(), data.size());
- fout.write_raw(padding.data(), padding.size());
+ static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);

- if (i % 2 == 0) {
- printf(".");
- }
- }
+ printf("\nexample usage:\n");
+ printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
+ printf("\nNOTE: output model is F16\n");
  printf("\n");
-
- // close gguf
- gguf_free(gguf_out);
- gguf_free(gguf_in);
-
- // free loras
- for (size_t i = 0; i < loras.size(); ++i) {
- free_lora(loras[i]);
- }
  }

  int main(int argc, char ** argv) {
- struct export_lora_params params = get_default_export_lora_params();
+ gpt_params params;

- if (!export_lora_params_parse(argc, argv, &params)) {
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
  return 1;
  }

- export_lora(&params);
+ g_verbose = (params.verbosity == 1);
+ try {
+ lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
+ ctx.run_merge();
+ } catch (const std::exception & err) {
+ fprintf(stderr, "%s\n", err.what());
+ exit(EXIT_FAILURE);
+ }
+
+ printf("done, output file is %s\n", params.lora_outfile.c_str());

  return 0;
  }
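Note on the export-lora diff above: the rewritten tool reads GGUF LoRA adapters and merges them directly into the base model tensors, replacing the old GGLA reader. For every tensor that has a lora_a/lora_b pair, the delta added to the base weight is the product of the adapter's B and A matrices, scaled by scale = user_scale * alpha / rank (falling back to user_scale when no alpha is stored), and the merged file is written with file type F16 (F32 base tensors stay F32). The C++ sketch below only restates that scaling rule on plain dense row-major matrices; it is an illustration, not the ggml-graph implementation above, and the merge_dense() helper and its layout are assumptions made for the example.

#include <cstddef>
#include <vector>

// Illustrative sketch: apply one LoRA adapter to a dense weight matrix W (n_out x n_in),
// with A (rank x n_in) and B (n_out x rank) stored row-major. merge_dense() is a
// hypothetical helper, not part of export-lora.cpp.
static void merge_dense(std::vector<float> & W,
                        const std::vector<float> & A,
                        const std::vector<float> & B,
                        size_t n_out, size_t n_in, size_t rank,
                        float user_scale, float alpha) {
    // same rule as in merge_tensor(): scale = user_scale * alpha / rank, or user_scale if alpha == 0
    const float scale = (alpha != 0.0f) ? user_scale * alpha / (float) rank : user_scale;
    for (size_t i = 0; i < n_out; ++i) {
        for (size_t j = 0; j < n_in; ++j) {
            float delta = 0.0f;
            for (size_t r = 0; r < rank; ++r) {
                delta += B[i*rank + r] * A[r*n_in + j]; // (B x A)[i][j]
            }
            W[i*n_in + j] += scale * delta; // W' = W + scale * (B x A)
        }
    }
}

The usage string printed by the new print_usage() reflects this flow: -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf, with the note that the output model is F16.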