@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/export-lora/export-lora.cpp

@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -10,6 +11,12 @@
 
 static bool g_verbose = false;
 
+struct tensor_transformation {
+    struct ggml_tensor * in;
+    struct ggml_tensor * out;
+    bool is_copy;
+};
+
 static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
     int id = gguf_find_key(ctx_gguf, key.c_str());
     return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
@@ -50,20 +57,6 @@ static struct gguf_context * load_gguf(std::string & fname, struct ggml_context
     return ctx_gguf;
 }
 
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 struct file_input {
     struct ggml_context * ctx_meta = nullptr;
     struct gguf_context * ctx_gguf = nullptr;
@@ -135,7 +128,7 @@ struct lora_merge_ctx {
 
     lora_merge_ctx(
             std::string & base_fname,
-            std::vector<std::tuple<std::string, float>> & lora_files,
+            std::vector<common_lora_adapter_info> & lora_files,
             std::string & outfile,
             int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
         fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@@ -144,9 +137,9 @@ struct lora_merge_ctx {
             throw std::runtime_error("split model is not yet supported");
         }
 
-        for (auto lora_inp : lora_files) {
-            auto fname = std::get<0>(lora_inp);
-            auto scale = std::get<1>(lora_inp);
+        for (auto & lora_inp : lora_files) {
+            auto fname = lora_inp.path;
+            auto scale = lora_inp.scale;
             std::unique_ptr<file_input> adapter(new file_input(fname, scale));
             check_metadata_lora(adapter.get());
             adapters.push_back(std::move(adapter));
@@ -212,8 +205,7 @@ struct lora_merge_ctx {
         }
 
         // mapping base tensor to out tensor (same shape with base, but different type)
-        // if out_tensor == nullptr, we only copy it
-        std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
+        std::vector<tensor_transformation> trans;
         for (auto & it : base_model.tensors) {
             bool t_a = true;
             bool t_b = true;
@@ -226,14 +218,22 @@ struct lora_merge_ctx {
                 // only copy
                 struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
                 ggml_set_name(cpy_tensor, base_tensor->name);
-                base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+                trans.push_back({
+                    cpy_tensor,
+                    cpy_tensor,
+                    true,
+                });
                 gguf_add_tensor(ctx_out, cpy_tensor);
             } else if (t_a && t_b) {
                 // need merging
                 struct ggml_tensor * out_tensor = ggml_new_tensor(
                     ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
                 ggml_set_name(out_tensor, base_tensor->name);
-                base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+                trans.push_back({
+                    base_tensor,
+                    out_tensor,
+                    false,
+                });
                 gguf_add_tensor(ctx_out, out_tensor);
             } else {
                 throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
@@ -248,12 +248,12 @@ struct lora_merge_ctx {
 
         // process base model tensors
         size_t n_merged = 0;
-        for (auto & it : base_to_out_tensors) {
-            if (it.second != nullptr) {
-                merge_tensor(it.first, it.second);
+        for (auto & it : trans) {
+            if (!it.is_copy) {
+                merge_tensor(it.in, it.out);
                 n_merged++;
             } else {
-                copy_tensor(it.first);
+                copy_tensor(it.in);
             }
         }
 
@@ -266,7 +266,7 @@ struct lora_merge_ctx {
         }
 
         printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
+        printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
     }
 
     void copy_tensor(struct ggml_tensor * base) {
@@ -299,6 +299,10 @@ struct lora_merge_ctx {
         for (size_t i = 0; i < adapters.size(); ++i) {
             auto t_a = adapters[i]->get_tensor(name_lora_a);
             auto t_b = adapters[i]->get_tensor(name_lora_b);
+            // TODO: add support for quantized lora
+            if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) {
+                throw std::runtime_error("quantized LoRA adapters is not supported, please retry with f16 or f32");
+            }
             inp_a[i] = ggml_dup_tensor(ctx, t_a);
             inp_b[i] = ggml_dup_tensor(ctx, t_b);
         }
@@ -310,9 +314,9 @@ struct lora_merge_ctx {
                 // optionally dequantize it
                 printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
                 auto nels = ggml_nelements(inp_base);
-                ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+                const auto * qtype = ggml_get_type_traits(base->type);
                 std::vector<uint8_t> dequant_buf(nels * sizeof(float));
-                qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+                qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
                 ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
             } else {
                 ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
@@ -366,7 +370,7 @@ struct lora_merge_ctx {
 
         // write data to output file
         {
-            auto result = gf->nodes[gf->n_nodes - 1];
+            auto * result = ggml_graph_node(gf, -1);
             size_t len = ggml_nbytes(result);
             if (read_buf.size() < len) {
                 read_buf.resize(len);
@@ -388,9 +392,7 @@ struct lora_merge_ctx {
     }
 };
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     printf("\nexample usage:\n");
     printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
     printf("\nNOTE: output model is F16\n");
@@ -398,16 +400,15 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
        return 1;
     }
 
-    g_verbose = (params.verbosity == 1);
+    g_verbose = (params.verbosity > 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
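
The export-lora changes above track llama.cpp's migration from `gpt_params`/`gpt_params_parse` to the `common_params` API declared in the new `common/arg.h`. Below is a minimal sketch of the migrated entry-point pattern, assembled from the calls visible in this diff (it is not code shipped in the package):

```cpp
// Sketch of the new arg.h entry-point pattern, assuming llama.cpp's common
// library (arg.h, common.h) is linked. LLAMA_EXAMPLE_EXPORT_LORA selects
// which example-specific flags the parser accepts.
#include "arg.h"
#include "common.h"

#include <cstdio>

static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
}

int main(int argc, char ** argv) {
    common_params params;
    // common_params_parse reports errors and invokes the usage callback
    // itself, so the caller no longer calls print_usage manually on failure
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
        return 1;
    }
    // parsed settings now live in `params`, e.g. params.model,
    // params.lora_adapters and params.cpuparams.n_threads as used above
    return 0;
}
```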
package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp

@@ -1,9 +1,5 @@
-#define LLAMA_API_INTERNAL
-
-#include "grammar-parser.h"
-#include "ggml.h"
-#include "llama.h"
 #include "unicode.h"
+#include "llama-grammar.h"
 
 #include <cstdio>
 #include <cstdlib>
@@ -12,29 +8,28 @@
 #include <string>
 #include <vector>
 
-static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
-    auto decoded = decode_utf8(input_str, {});
-    const auto & code_points = decoded.first;
+static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
+    const auto cpts = unicode_cpts_from_utf8(input_str);
 
     const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
-          llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
+          llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
 
     size_t pos = 0;
-    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
+    for (const auto & cpt : cpts) {
+        const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
 
-        llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
+        llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
 
-        if (cur_stacks.empty()) {
+        if (stacks_cur.empty()) {
             error_pos = pos;
-            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
-            cur_stacks = prev_stacks;
+            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
+            stacks_cur = stacks_prev;
             return false;
         }
         ++pos;
     }
 
-    for (const auto & stack : cur_stacks) {
+    for (const auto & stack : stacks_cur) {
         if (stack.empty()) {
             return true;
         }
@@ -85,27 +80,7 @@ int main(int argc, char** argv) {
         grammar_str = buffer.str();
     }
 
-    // Parse the GBNF grammar
-    auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
-
-    // will be empty (default) if there are parse errors
-    if (parsed_grammar.rules.empty()) {
-        fprintf(stdout, "%s: failed to parse grammar\n", __func__);
-        return 1;
-    }
-
-    // Ensure that there is a "root" node.
-    if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
-        fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
-        return 1;
-    }
-
-    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-
-    // Create the LLAMA grammar
-    auto grammar = llama_grammar_init(
-        grammar_rules.data(),
-        grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
     if (grammar == nullptr) {
         throw std::runtime_error("Failed to initialize llama_grammar");
     }
@@ -122,7 +97,7 @@ int main(int argc, char** argv) {
     // Validate the input string against the grammar
     size_t error_pos;
     std::string error_msg;
-    bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
+    bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);
 
     if (is_valid) {
         fprintf(stdout, "Input string is valid according to the grammar.\n");
@@ -131,7 +106,7 @@ int main(int argc, char** argv) {
     }
 
     // Clean up
-    llama_grammar_free(grammar);
+    llama_grammar_free_impl(grammar);
 
     return 0;
 }
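
With `common/grammar-parser` removed, the validator now goes through the consolidated `llama-grammar.h` internals: `llama_grammar_init_impl` parses the GBNF text directly, and the rules/stacks accessors drive per-code-point validation. A minimal sketch of that flow, assembled from the calls in this diff and assuming llama.cpp's internal headers are available:

```cpp
// Sketch of the consolidated grammar API. Passing nullptr for the vocab
// builds the grammar for validation only, as the validator above does.
#include "llama-grammar.h"
#include "unicode.h"

#include <cstdio>
#include <string>

int main() {
    const std::string gbnf = "root ::= \"yes\" | \"no\"";

    llama_grammar * grammar = llama_grammar_init_impl(nullptr, gbnf.c_str(), "root");
    if (grammar == nullptr) {
        fprintf(stderr, "failed to initialize llama_grammar\n");
        return 1;
    }

    // feed one code point at a time, mirroring llama_grammar_validate above
    const auto cpts = unicode_cpts_from_utf8("yes");
    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
          llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
    for (const auto & cpt : cpts) {
        const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
        llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
        if (stacks_cur.empty()) {
            fprintf(stderr, "input rejected by grammar\n");
            break;
        }
    }

    llama_grammar_free_impl(grammar);
    return 0;
}
```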
package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt

@@ -1,5 +1,5 @@
-set(TARGET llama-baby-llama)
-add_executable(${TARGET} baby-llama.cpp)
+set(TARGET llama-gen-docs)
+add_executable(${TARGET} gen-docs.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
package/src/llama.cpp/examples/gen-docs/gen-docs.cpp

@@ -0,0 +1,83 @@
+#include "arg.h"
+#include "common.h"
+
+#include <fstream>
+#include <string>
+
+// Export usage message (-h) to markdown format
+
+static void write_table_header(std::ofstream & file) {
+    file << "| Argument | Explanation |\n";
+    file << "| -------- | ----------- |\n";
+}
+
+static void write_table_entry(std::ofstream & file, const common_arg & opt) {
+    file << "| `";
+    // args
+    for (const auto & arg : opt.args) {
+        if (arg == opt.args.front()) {
+            file << arg;
+            if (opt.args.size() > 1) file << ", ";
+        } else {
+            file << arg << (arg != opt.args.back() ? ", " : "");
+        }
+    }
+    // value hint
+    if (opt.value_hint) {
+        std::string md_value_hint(opt.value_hint);
+        string_replace_all(md_value_hint, "|", "\\|");
+        file << " " << md_value_hint;
+    }
+    if (opt.value_hint_2) {
+        std::string md_value_hint_2(opt.value_hint_2);
+        string_replace_all(md_value_hint_2, "|", "\\|");
+        file << " " << md_value_hint_2;
+    }
+    // help text
+    std::string md_help(opt.help);
+    string_replace_all(md_help, "\n", "<br/>");
+    string_replace_all(md_help, "|", "\\|");
+    file << "` | " << md_help << " |\n";
+}
+
+static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
+    write_table_header(file);
+    for (const auto & opt : opts) {
+        write_table_entry(file, *opt);
+    }
+}
+
+static void export_md(std::string fname, llama_example ex) {
+    std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
+
+    common_params params;
+    auto ctx_arg = common_params_parser_init(params, ex);
+
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+    for (auto & opt : ctx_arg.options) {
+        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+
+    file << "**Common params**\n\n";
+    write_table(file, common_options);
+    file << "\n\n**Sampling params**\n\n";
+    write_table(file, sparam_options);
+    file << "\n\n**Example-specific params**\n\n";
+    write_table(file, specific_options);
+}
+
+int main(int, char **) {
+    export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
+    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
+
+    return 0;
+}
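
Judging from `export_md` above, running the resulting `llama-gen-docs` binary writes `autogen-main.md` and `autogen-server.md` into the working directory; documenting another tool would presumably amount to one more `export_md` call with the matching `LLAMA_EXAMPLE_*` value.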
package/src/llama.cpp/examples/gguf-split/gguf-split.cpp

@@ -22,12 +22,20 @@
 #endif
 
 enum split_operation : uint8_t {
-    SPLIT_OP_SPLIT,
-    SPLIT_OP_MERGE,
+    OP_NONE,
+    OP_SPLIT,
+    OP_MERGE,
+};
+
+enum split_mode : uint8_t {
+    MODE_NONE,
+    MODE_TENSOR,
+    MODE_SIZE,
 };
 
 struct split_params {
-    split_operation operation = SPLIT_OP_SPLIT;
+    split_operation operation = OP_NONE;
+    split_mode mode = MODE_NONE;
     size_t n_bytes_split = 0;
     int n_split_tensors = 128;
     std::string input;
@@ -87,59 +95,52 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
     }
 
         bool arg_found = false;
-        bool is_op_set = false;
-        bool is_mode_set = false;
         if (arg == "-h" || arg == "--help") {
             split_print_usage(argv[0]);
             exit(0);
-        }
-        if (arg == "--version") {
+        } else if (arg == "--version") {
             fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
             fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
             exit(0);
-        }
-        if (arg == "--dry-run") {
+        } else if (arg == "--dry-run") {
             arg_found = true;
             params.dry_run = true;
-        }
-        if (arg == "--no-tensor-first-split") {
+        } else if (arg == "--no-tensor-first-split") {
             arg_found = true;
             params.no_tensor_first_split = true;
-        }
-
-        if (is_op_set) {
-            throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
-        }
-        if (arg == "--merge") {
+        } else if (arg == "--merge") {
             arg_found = true;
-            is_op_set = true;
-            params.operation = SPLIT_OP_MERGE;
-        }
-        if (arg == "--split") {
+            if (params.operation != OP_NONE && params.operation != OP_MERGE) {
+                throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
+            }
+            params.operation = OP_MERGE;
+        } else if (arg == "--split") {
             arg_found = true;
-            is_op_set = true;
-            params.operation = SPLIT_OP_SPLIT;
-        }
-
-        if (is_mode_set) {
-            throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
-        }
-        if (arg == "--split-max-tensors") {
+            if (params.operation != OP_NONE && params.operation != OP_SPLIT) {
+                throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
+            }
+            params.operation = OP_SPLIT;
+        } else if (arg == "--split-max-tensors") {
             if (++arg_idx >= argc) {
                 invalid_param = true;
                 break;
             }
             arg_found = true;
-            is_mode_set = true;
+            if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) {
+                throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
+            }
+            params.mode = MODE_TENSOR;
             params.n_split_tensors = atoi(argv[arg_idx]);
-        }
-        if (arg == "--split-max-size") {
+        } else if (arg == "--split-max-size") {
             if (++arg_idx >= argc) {
                 invalid_param = true;
                 break;
             }
             arg_found = true;
-            is_mode_set = true;
+            if (params.mode != MODE_NONE && params.mode != MODE_SIZE) {
+                throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
+            }
+            params.mode = MODE_SIZE;
             params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
         }
 
@@ -148,11 +149,20 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
         }
     }
 
+    // the operation is split if not specified
+    if (params.operation == OP_NONE) {
+        params.operation = OP_SPLIT;
+    }
+    // the split mode is by tensor if not specified
+    if (params.mode == MODE_NONE) {
+        params.mode = MODE_TENSOR;
+    }
+
     if (invalid_param) {
         throw std::invalid_argument("error: invalid parameter for argument: " + arg);
     }
 
-    if (argc - arg_idx < 2) {
+    if (argc - arg_idx != 2) {
         throw std::invalid_argument("error: bad arguments");
     }
 
@@ -265,13 +275,15 @@ struct split_strategy {
     }
 
     bool should_split(int i_tensor, size_t next_size) {
-        if (params.n_bytes_split > 0) {
+        if (params.mode == MODE_SIZE) {
             // split by max size per file
             return next_size > params.n_bytes_split;
-        } else {
+        } else if (params.mode == MODE_TENSOR) {
             // split by number of tensors per file
             return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
         }
+        // should never happen
+        GGML_ABORT("invalid mode");
     }
 
     void print_info() {
@@ -389,10 +401,17 @@ static void gguf_merge(const split_params & split_params) {
     int n_split = 1;
     int total_tensors = 0;
 
-    auto * ctx_out = gguf_init_empty();
+    // avoid overwriting existing output file
+    if (std::ifstream(split_params.output.c_str())) {
+        fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
+        exit(EXIT_FAILURE);
+    }
+
     std::ofstream fout(split_params.output.c_str(), std::ios::binary);
     fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
+    auto * ctx_out = gguf_init_empty();
+
     std::vector<uint8_t> read_data;
     std::vector<ggml_context *> ctx_metas;
     std::vector<gguf_context *> ctx_ggufs;
@@ -552,9 +571,9 @@ int main(int argc, const char ** argv) {
     split_params_parse(argc, argv, params);
 
     switch (params.operation) {
-        case SPLIT_OP_SPLIT: gguf_split(params);
+        case OP_SPLIT: gguf_split(params);
             break;
-        case SPLIT_OP_MERGE: gguf_merge(params);
+        case OP_MERGE: gguf_merge(params);
             break;
         default: split_print_usage(argv[0]);
             exit(EXIT_FAILURE);
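
The gguf-split rework above replaces the local `is_op_set`/`is_mode_set` flags with explicit `*_NONE` sentinels, so conflicting flags are caught where each one is parsed and defaults are applied once after the loop. A self-contained sketch of that pattern, reusing the enum names from this diff (the `set_operation` helper is hypothetical, added here only for illustration):

```cpp
// Sketch of the NONE-sentinel pattern: "not specified" is representable,
// conflicts fail at parse time, defaults are applied in one place.
#include <cstdint>
#include <stdexcept>

enum split_operation : uint8_t { OP_NONE, OP_SPLIT, OP_MERGE };

static void set_operation(split_operation & op, split_operation requested) {
    // a second, conflicting flag is an error; repeating the same flag is not
    if (op != OP_NONE && op != requested) {
        throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
    }
    op = requested;
}

int main() {
    split_operation op = OP_NONE;
    set_operation(op, OP_SPLIT); // e.g. the user passed --split
    if (op == OP_NONE) {
        op = OP_SPLIT;           // the operation defaults to split
    }
    return op == OP_SPLIT ? 0 : 1;
}
```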