@fugood/llama.node 0.3.17 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/src/llama-model-saver.cpp (new file)
@@ -0,0 +1,281 @@
+#include "llama-model-saver.h"
+
+#include "gguf.h"
+
+#include "llama.h"
+#include "llama-hparams.h"
+#include "llama-model.h"
+#include "llama-vocab.h"
+
+#include <string>
+
+llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
+    gguf_ctx = gguf_init_empty();
+}
+
+llama_model_saver::~llama_model_saver() {
+    gguf_free(gguf_ctx);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
+    gguf_set_val_u32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const int32_t value) {
+    gguf_set_val_i32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const float value) {
+    gguf_set_val_f32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const bool value) {
+    gguf_set_val_bool(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const char * value) {
+    gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+[[noreturn]]
+void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
+    GGML_UNUSED(key);
+    GGML_UNUSED(value);
+    GGML_ABORT("fatal error"); // this should never be called, only needed to make the template below compile
+}
+
+template <typename Container>
+void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
+    const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
+    GGML_ASSERT(n_values <= value.size());
+
+    if (n_values == 0) {
+        return;
+    }
+
+    if (per_layer) {
+        bool all_values_the_same = true;
+        for (size_t i = 1; i < n_values; ++i) {
+            if (value[i] != value[0]) {
+                all_values_the_same = false;
+                break;
+            }
+        }
+        if (all_values_the_same) {
+            add_kv(key, value[0]);
+            return;
+        }
+    }
+
+    if (std::is_same<typename Container::value_type, uint8_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT8, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, int8_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, uint32_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, int32_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, float>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), n_values);
+    } else if (std::is_same<Container, std::string>::value) {
+        gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), reinterpret_cast<const char *>(value.data()));
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
+    std::vector<const char *> tmp(value.size());
+    for (size_t i = 0; i < value.size(); ++i) {
+        tmp[i] = value[i].c_str();
+    }
+    gguf_set_arr_str(gguf_ctx, llm_kv(key).c_str(), tmp.data(), tmp.size());
+}
+
+void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
+    if (!tensor) {
+        return;
+    }
+    if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
+        GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
+        return;
+    }
+    gguf_add_tensor(gguf_ctx, tensor);
+}
+
+void llama_model_saver::add_kv_from_model() {
+    const llama_hparams & hparams = model.hparams;
+    const llama_vocab & vocab = model.vocab;
+
+    const int32_t n_vocab = vocab.n_tokens();
+    std::vector<std::string> tokens(n_vocab);
+    std::vector<float> scores(n_vocab);
+    std::vector<int32_t> token_types(n_vocab);
+
+    for (int32_t id = 0; id < n_vocab; ++id) {
+        const llama_vocab::token_data & token_data = vocab.get_token_data(id);
+
+        tokens[id] = token_data.text;
+        scores[id] = token_data.score;
+
+        switch(token_data.attr) {
+            case LLAMA_TOKEN_ATTR_UNKNOWN:      token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN;      break;
+            case LLAMA_TOKEN_ATTR_UNUSED:       token_types[id] = LLAMA_TOKEN_TYPE_UNUSED;       break;
+            case LLAMA_TOKEN_ATTR_NORMAL:       token_types[id] = LLAMA_TOKEN_TYPE_NORMAL;       break;
+            case LLAMA_TOKEN_ATTR_CONTROL:      token_types[id] = LLAMA_TOKEN_TYPE_CONTROL;      break;
+            case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
+            case LLAMA_TOKEN_ATTR_BYTE:         token_types[id] = LLAMA_TOKEN_TYPE_BYTE;         break;
+            case LLAMA_TOKEN_ATTR_UNDEFINED:
+            default:                            token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED;    break;
+        }
+    }
+
+    // add_kv(LLM_KV_GENERAL_TYPE, ???);
+    add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
+    // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
+    // add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
+    add_kv(LLM_KV_GENERAL_NAME, model.name);
+    // add_kv(LLM_KV_GENERAL_AUTHOR, ???);
+    // add_kv(LLM_KV_GENERAL_VERSION, ???);
+    // add_kv(LLM_KV_GENERAL_URL, ???);
+    // add_kv(LLM_KV_GENERAL_DESCRIPTION, ???);
+    // add_kv(LLM_KV_GENERAL_LICENSE, ???);
+    // add_kv(LLM_KV_GENERAL_SOURCE_URL, ???);
+    // add_kv(LLM_KV_GENERAL_SOURCE_HF_REPO, ???);
+
+    add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
+    add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
+    add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+    add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+    add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
+    add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+    add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+    add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+    // add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???);
+    add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+    add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+    add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+    add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+    add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
+    add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+    add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id);
+    add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping);
+    add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping);
+    add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm);
+    add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers);
+    add_kv(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+    add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
+    add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+    add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+
+    add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
+    add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
+    add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+    add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+    add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
+    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
+    add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+    add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+    add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+    add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+    add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+    add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+    const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
+
+    add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
+    add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
+    // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
+    add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
+    add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor);
+    add_kv(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor);
+    add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn);
+    add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned);
+    add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+    // TODO: implement split file support
+    // add_kv(LLM_KV_SPLIT_NO, ???);
+    // add_kv(LLM_KV_SPLIT_COUNT, ???);
+    // add_kv(LLM_KV_SPLIT_TENSORS_COUNT, ???);
+
+    add_kv(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+    add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+    add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+    add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms);
+
+    add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+
+    add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model());
+    add_kv(LLM_KV_TOKENIZER_PRE, vocab.get_tokenizer_pre());
+    add_kv(LLM_KV_TOKENIZER_LIST, tokens);
+    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE, token_types);
+    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, vocab.n_token_types());
+    add_kv(LLM_KV_TOKENIZER_SCORES, scores);
+    add_kv(LLM_KV_TOKENIZER_MERGES, vocab.get_bpe_merges());
+    // FIXME llama_token is type i32 but when reading in a GGUF file u32 is expected, not an issue for writing though
+    add_kv(LLM_KV_TOKENIZER_BOS_ID, uint32_t(vocab.token_bos()));
+    add_kv(LLM_KV_TOKENIZER_EOS_ID, uint32_t(vocab.token_eos()));
+    add_kv(LLM_KV_TOKENIZER_EOT_ID, uint32_t(vocab.token_eot()));
+    add_kv(LLM_KV_TOKENIZER_EOM_ID, uint32_t(vocab.token_eom()));
+    add_kv(LLM_KV_TOKENIZER_UNK_ID, uint32_t(vocab.token_unk()));
+    add_kv(LLM_KV_TOKENIZER_SEP_ID, uint32_t(vocab.token_sep()));
+    add_kv(LLM_KV_TOKENIZER_PAD_ID, uint32_t(vocab.token_pad()));
+    // add_kv(LLM_KV_TOKENIZER_CLS_ID, uint32_t(vocab.token_bos())); // deprecated
+    // add_kv(LLM_KV_TOKENIZER_MASK_ID, ???);
+    add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos());
+    add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos());
+    add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix());
+    add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces());
+    add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap());
+    // add_kv(LLM_KV_TOKENIZER_HF_JSON, ???);
+    // add_kv(LLM_KV_TOKENIZER_RWKV, ???);
+    add_kv(LLM_KV_TOKENIZER_FIM_PRE_ID, uint32_t(vocab.token_fim_pre()));
+    add_kv(LLM_KV_TOKENIZER_FIM_SUF_ID, uint32_t(vocab.token_fim_suf()));
+    add_kv(LLM_KV_TOKENIZER_FIM_MID_ID, uint32_t(vocab.token_fim_mid()));
+    add_kv(LLM_KV_TOKENIZER_FIM_PAD_ID, uint32_t(vocab.token_fim_pad()));
+    add_kv(LLM_KV_TOKENIZER_FIM_REP_ID, uint32_t(vocab.token_fim_rep()));
+    add_kv(LLM_KV_TOKENIZER_FIM_SEP_ID, uint32_t(vocab.token_fim_sep()));
+
+    // TODO: implement LoRA support
+    // add_kv(LLM_KV_ADAPTER_TYPE, ???);
+    // add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???);
+
+    // deprecated
+    // add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???);
+    // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???);
+    // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???);
+}
+
+void llama_model_saver::add_tensors_from_model() {
+    if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
+        add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
+    }
+    add_tensor(model.type_embd);
+    add_tensor(model.pos_embd);
+    add_tensor(model.tok_norm);
+    add_tensor(model.tok_norm_b);
+    add_tensor(model.output_norm);
+    add_tensor(model.output_norm_b);
+    add_tensor(model.output);
+    add_tensor(model.output_b);
+    add_tensor(model.output_norm_enc);
+    add_tensor(model.cls);
+    add_tensor(model.cls_b);
+    add_tensor(model.cls_out);
+    add_tensor(model.cls_out_b);
+
+    for (const struct llama_layer & layer : model.layers) {
+        for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
+            add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
+        }
+    }
+}
+
+void llama_model_saver::save(const std::string & path_model) {
+    gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
+}
+
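A note on the template at the heart of the new file: add_kv(key, value, per_layer) dispatches on Container::value_type with ordinary runtime if/else, so every branch must compile for every instantiation. That is why the [[noreturn]] add_kv(enum llm_kv, char) overload exists: when Container is std::string, the per-layer shortcut add_kv(key, value[0]) resolves to a char argument that can never actually be reached. The reinterpret_cast in the std::string branch is there for the same reason, since that branch is also instantiated for containers whose data() is not const char *. For illustration only (not the code that shipped), the same dispatch written in C++17 style, where if constexpr discards untaken branches at instantiation time and neither workaround is needed:

// Sketch only: C++17 `if constexpr` drops untaken branches during template
// instantiation, so no unreachable add_kv(llm_kv, char) overload is required.
// Assumes <type_traits> and the same gguf_* setters used above.
template <typename Container>
void add_kv_cxx17(const enum llm_kv key, const Container & value) {
    if constexpr (std::is_same_v<Container, std::string>) {
        gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), value.c_str());
    } else if constexpr (std::is_same_v<typename Container::value_type, uint32_t>) {
        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), value.size());
    } else if constexpr (std::is_same_v<typename Container::value_type, float>) {
        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), value.size());
    } // ... remaining element types mirror the shipped if/else chain
}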
package/src/llama.cpp/src/llama-model-saver.h (new file)
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "llama.h"
+#include "llama-arch.h"
+
+#include <vector>
+
+struct llama_model_saver {
+    struct gguf_context * gguf_ctx = nullptr;
+    const struct llama_model & model;
+    const struct LLM_KV llm_kv;
+
+    llama_model_saver(const struct llama_model & model);
+    ~llama_model_saver();
+
+    void add_kv(enum llm_kv key, uint32_t value);
+    void add_kv(enum llm_kv key, int32_t value);
+    void add_kv(enum llm_kv key, float value);
+    void add_kv(enum llm_kv key, bool value);
+    void add_kv(enum llm_kv key, const char * value);
+
+    [[noreturn]]
+    void add_kv(enum llm_kv key, char value); // needed to make the template below compile
+
+    template <typename Container>
+    void add_kv(enum llm_kv key, const Container & value, bool per_layer = false);
+
+    void add_kv(enum llm_kv key, const std::vector<std::string> & value);
+
+    void add_tensor(const struct ggml_tensor * tensor);
+
+    void add_kv_from_model();
+
+    void add_tensors_from_model();
+
+    void save(const std::string & path_model);
+};
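Taken together, the saver is driven in four steps: construct it around a loaded model (which allocates an empty gguf_context), mirror hparams and vocab into KV pairs, register every non-null tensor, then write the file. A minimal sketch of that call order, assuming a const llama_model & obtained from the regular loader; the helper name below is hypothetical, and in the release itself the class is presumably reached through the library (the enlarged include/llama.h and the new examples/training/finetune.cpp in the file list above are the likely entry points):

// Hypothetical helper -- not part of this diff -- showing the call order
// of the new class as declared in llama-model-saver.h above.
#include "llama-model-saver.h"

static void write_model_gguf(const llama_model & model, const std::string & path) {
    llama_model_saver ms(model);  // gguf_init_empty() under the hood
    ms.add_kv_from_model();       // hparams + vocab -> GGUF KV pairs
    ms.add_tensors_from_model();  // every non-null tensor, layers included
    ms.save(path);                // gguf_write_to_file(), tensor data included
}

Note also the loop in add_tensors_from_model: it walks struct llama_layer as if it were a flat array of ggml_tensor pointers via reinterpret_cast, which silently assumes the struct contains nothing but tensor pointers.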