@fugood/llama.node 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +3 -1
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +61 -6
  25. package/src/LlamaContext.h +1 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
@@ -0,0 +1,3953 @@
1
+ #include "llama-model.h"
2
+
3
+ #include "llama-impl.h"
4
+ #include "llama-mmap.h"
5
+ #include "llama-model-loader.h"
6
+
7
+ #include "ggml-cpp.h"
8
+
9
+ #include <algorithm>
10
+ #include <cassert>
11
+ #include <cstring>
12
+ #include <functional>
13
+ #include <map>
14
+ #include <sstream>
15
+ #include <stdexcept>
16
+
17
+ const char * llm_type_name(llm_type type) {
18
+ switch (type) {
19
+ case LLM_TYPE_14M: return "14M";
20
+ case LLM_TYPE_17M: return "17M";
21
+ case LLM_TYPE_22M: return "22M";
22
+ case LLM_TYPE_33M: return "33M";
23
+ case LLM_TYPE_60M: return "60M";
24
+ case LLM_TYPE_70M: return "70M";
25
+ case LLM_TYPE_80M: return "80M";
26
+ case LLM_TYPE_109M: return "109M";
27
+ case LLM_TYPE_137M: return "137M";
28
+ case LLM_TYPE_160M: return "160M";
29
+ case LLM_TYPE_220M: return "220M";
30
+ case LLM_TYPE_250M: return "250M";
31
+ case LLM_TYPE_270M: return "270M";
32
+ case LLM_TYPE_335M: return "335M";
33
+ case LLM_TYPE_410M: return "410M";
34
+ case LLM_TYPE_450M: return "450M";
35
+ case LLM_TYPE_770M: return "770M";
36
+ case LLM_TYPE_780M: return "780M";
37
+ case LLM_TYPE_0_5B: return "0.5B";
38
+ case LLM_TYPE_1B: return "1B";
39
+ case LLM_TYPE_1_3B: return "1.3B";
40
+ case LLM_TYPE_1_4B: return "1.4B";
41
+ case LLM_TYPE_1_5B: return "1.5B";
42
+ case LLM_TYPE_1_6B: return "1.6B";
43
+ case LLM_TYPE_2B: return "2B";
44
+ case LLM_TYPE_2_8B: return "2.8B";
45
+ case LLM_TYPE_3B: return "3B";
46
+ case LLM_TYPE_4B: return "4B";
47
+ case LLM_TYPE_6B: return "6B";
48
+ case LLM_TYPE_6_9B: return "6.9B";
49
+ case LLM_TYPE_7B: return "7B";
50
+ case LLM_TYPE_8B: return "8B";
51
+ case LLM_TYPE_9B: return "9B";
52
+ case LLM_TYPE_11B: return "11B";
53
+ case LLM_TYPE_12B: return "12B";
54
+ case LLM_TYPE_13B: return "13B";
55
+ case LLM_TYPE_14B: return "14B";
56
+ case LLM_TYPE_15B: return "15B";
57
+ case LLM_TYPE_16B: return "16B";
58
+ case LLM_TYPE_20B: return "20B";
59
+ case LLM_TYPE_30B: return "30B";
60
+ case LLM_TYPE_32B: return "32B";
61
+ case LLM_TYPE_34B: return "34B";
62
+ case LLM_TYPE_35B: return "35B";
63
+ case LLM_TYPE_40B: return "40B";
64
+ case LLM_TYPE_65B: return "65B";
65
+ case LLM_TYPE_70B: return "70B";
66
+ case LLM_TYPE_236B: return "236B";
67
+ case LLM_TYPE_314B: return "314B";
68
+ case LLM_TYPE_671B: return "671B";
69
+ case LLM_TYPE_SMALL: return "0.1B";
70
+ case LLM_TYPE_MEDIUM: return "0.4B";
71
+ case LLM_TYPE_LARGE: return "0.8B";
72
+ case LLM_TYPE_XL: return "1.5B";
73
+ case LLM_TYPE_A1_7B: return "A1.7B";
74
+ case LLM_TYPE_A2_7B: return "A2.7B";
75
+ case LLM_TYPE_8x7B: return "8x7B";
76
+ case LLM_TYPE_8x22B: return "8x22B";
77
+ case LLM_TYPE_16x12B: return "16x12B";
78
+ case LLM_TYPE_16x3_8B: return "16x3.8B";
79
+ case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
80
+ case LLM_TYPE_57B_A14B: return "57B.A14B";
81
+ case LLM_TYPE_27B: return "27B";
82
+ default: return "?B";
83
+ }
84
+ }
85
+
86
+ static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
87
+ switch (type) {
88
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
89
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
90
+ default: return "unknown";
91
+ }
92
+ }
93
+
94
+ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
95
+ { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
96
+ { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
97
+ { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
98
+ { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
99
+ };
100
+
101
+ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
102
+ for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
103
+ if (kv.second == name) {
104
+ return (llama_rope_scaling_type) kv.first;
105
+ }
106
+ }
107
+
108
+ return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
109
+ }
110
+
111
+ // checks if the weight tensor can be used with the specified buffer type and device
112
+ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
113
+ GGML_ASSERT(w != nullptr);
114
+
115
+ if (op == GGML_OP_NONE) {
116
+ return true;
117
+ }
118
+
119
+ ggml_init_params params = {
120
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
121
+ /*.mem_buffer =*/ NULL,
122
+ /*.no_alloc =*/ true,
123
+ };
124
+ ggml_context_ptr ctx_ptr { ggml_init(params) };
125
+ if (!ctx_ptr) {
126
+ throw std::runtime_error(format("failed to create ggml context"));
127
+ }
128
+ ggml_context * ctx = ctx_ptr.get();
129
+
130
+ ggml_tensor * op_tensor = nullptr;
131
+
132
+ switch (op) {
133
+ case GGML_OP_GET_ROWS:
134
+ {
135
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
136
+ op_tensor = ggml_get_rows(ctx, w, b);
137
+ } break;
138
+ case GGML_OP_MUL_MAT:
139
+ {
140
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
141
+ op_tensor = ggml_mul_mat(ctx, w, b);
142
+ } break;
143
+ case GGML_OP_MUL_MAT_ID:
144
+ {
145
+ int n_expert_used = hparams.n_expert_used;
146
+ ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
147
+ ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
148
+ op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
149
+ } break;
150
+ case GGML_OP_ADD:
151
+ {
152
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
153
+ op_tensor = ggml_add(ctx, a, w);
154
+ } break;
155
+ case GGML_OP_MUL:
156
+ {
157
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
158
+ op_tensor = ggml_mul(ctx, a, w);
159
+ } break;
160
+ case GGML_OP_DIV:
161
+ {
162
+ ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
163
+ op_tensor = ggml_div(ctx, a, w);
164
+ } break;
165
+ case GGML_OP_ROPE:
166
+ {
167
+ int n_embd_head = hparams.n_embd_head_v;
168
+ int n_head = hparams.n_head();
169
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
170
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
171
+ op_tensor = ggml_rope_ext(
172
+ ctx, a, b, w,
173
+ 0, 0, 0, 0, 0,
174
+ 0, 0, 0, 0
175
+ );
176
+
177
+ } break;
178
+ case GGML_OP_SSM_CONV:
179
+ {
180
+ // FIXME
181
+ ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
182
+ op_tensor = ggml_ssm_conv(ctx, conv_x, w);
183
+ } break;
184
+ case GGML_OP_SSM_SCAN:
185
+ {
186
+ // FIXME
187
+ const int64_t d_state = w->ne[0];
188
+ const int64_t d_inner = w->ne[1];
189
+ const int64_t n_seq_tokens = 512;
190
+ const int64_t n_seqs = 1;
191
+ ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
192
+ ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
193
+ ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
194
+ ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
195
+ ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
196
+ op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
197
+ } break;
198
+ case GGML_OP_RWKV_WKV6:
199
+ {
200
+ // FIXME
201
+ const int64_t S = 123;
202
+ const int64_t H = 123;
203
+ const int64_t n_tokens = 123;
204
+ const int64_t n_seqs = 123;
205
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
206
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
207
+ ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
208
+ ggml_tensor * tf = w;
209
+ ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
210
+ ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
211
+ op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
212
+ } break;
213
+ case GGML_OP_IM2COL:
214
+ {
215
+ const int n_embd = hparams.n_embd;
216
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
217
+ op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
218
+ } break;
219
+ default:
220
+ GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
221
+ }
222
+
223
+ // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
224
+ GGML_ASSERT(w->buffer == nullptr);
225
+ w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
226
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
227
+ ggml_backend_buffer_free(w->buffer);
228
+ w->buffer = nullptr;
229
+
230
+ return op_supported;
231
+ }
232
+
233
+ // lists of buffer types used for each layer
234
+ using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
235
+
236
+ // find the first buffer type in the list that can use the tensor
237
+ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
238
+ GGML_ASSERT(!buft_list.empty());
239
+ for (const auto & cur : buft_list) {
240
+ ggml_backend_dev_t cur_dev = cur.first;
241
+ ggml_backend_buffer_type_t cur_buft = cur.second;
242
+ if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
243
+ return cur_buft;
244
+ }
245
+ }
246
+ return nullptr;
247
+ }
248
+
249
+ // CPU: ACCEL -> CPU extra -> GPU host -> CPU
250
+ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
251
+ buft_list_t buft_list;
252
+
253
+ // add ACCEL buffer types
254
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
255
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
256
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
257
+ auto * buft = ggml_backend_dev_buffer_type(dev);
258
+ // skip
259
+ if (buft != ggml_backend_cpu_buffer_type()) {
260
+ buft_list.emplace_back(dev, buft);
261
+ }
262
+ }
263
+ }
264
+
265
+ // add extra buffer types
266
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
267
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
268
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
269
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
270
+ if (ggml_backend_dev_get_extra_bufts_fn) {
271
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
272
+ while (extra_bufts && *extra_bufts) {
273
+ buft_list.emplace_back(cpu_dev, *extra_bufts);
274
+ ++extra_bufts;
275
+ }
276
+ }
277
+
278
+ // add a host buffer type
279
+ // storing the tensors in a host buffer is useful when the processing of large batches
280
+ // is offloaded to a GPU device, since it reduces the time spent on data transfers
281
+ // generally, this will be done using the first device in the list
282
+ // a better approach would be to handle this on a weight-by-weight basis using the offload_op
283
+ // function of the device to determine if it would benefit from being stored in a host buffer
284
+ for (auto * dev : devices) {
285
+ ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
286
+ if (buft) {
287
+ buft_list.emplace_back(dev, buft);
288
+ break;
289
+ }
290
+ }
291
+
292
+ // add the CPU buffer type
293
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
294
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
295
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
296
+ buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
297
+ }
298
+ }
299
+
300
+ return buft_list;
301
+ }
302
+
303
+ // GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
304
+ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) {
305
+ buft_list_t buft_list;
306
+
307
+ // add the device split buffer type if requested and available
308
+ if (split_mode == LLAMA_SPLIT_MODE_ROW) {
309
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
310
+ auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
311
+ ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
312
+ if (ggml_backend_split_buffer_type_fn) {
313
+ size_t dev_index = [&]() {
314
+ auto * reg = ggml_backend_dev_backend_reg(dev);
315
+ for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
316
+ if (ggml_backend_reg_dev_get(reg, i) == dev) {
317
+ return i;
318
+ }
319
+ }
320
+ throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
321
+ }();
322
+ auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
323
+ if (buft != nullptr) {
324
+ buft_list.emplace_back(dev, buft);
325
+ }
326
+ }
327
+ }
328
+
329
+ // add the device default buffer type
330
+ buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
331
+
332
+ return buft_list;
333
+ }
334
+
335
+ struct llama_model::impl {
336
+ impl() {}
337
+ ~impl() {}
338
+
339
+ uint64_t n_elements = 0;
340
+
341
+ size_t n_bytes = 0;
342
+
343
+ std::string desc_str;
344
+
345
+ // model memory mapped files
346
+ llama_mmaps mappings;
347
+
348
+ // objects representing data potentially being locked in memory
349
+ llama_mlocks mlock_bufs;
350
+ llama_mlocks mlock_mmaps;
351
+
352
+ // contexts where the model tensors metadata is stored
353
+ std::vector<ggml_context_ptr> ctxs;
354
+
355
+ // the model memory buffers for the tensor data
356
+ std::vector<ggml_backend_buffer_ptr> bufs;
357
+
358
+ buft_list_t cpu_buft_list;
359
+ std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
360
+
361
+ struct layer_dev {
362
+ ggml_backend_dev_t dev;
363
+ buft_list_t * buft_list;
364
+ };
365
+
366
+ layer_dev dev_input = {};
367
+ layer_dev dev_output = {};
368
+ std::vector<layer_dev> dev_layer;
369
+ };
370
+
371
+ llama_model::llama_model(const struct llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
372
+ }
373
+
374
+ llama_model::~llama_model() {}
375
+
376
+ void llama_model::load_stats(llama_model_loader & ml) {
377
+ pimpl->n_elements = ml.n_elements;
378
+ pimpl->n_bytes = ml.n_bytes;
379
+ }
380
+
381
+ void llama_model::load_arch(llama_model_loader & ml) {
382
+ arch = ml.get_arch();
383
+ if (arch == LLM_ARCH_UNKNOWN) {
384
+ throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
385
+ }
386
+ }
387
+
388
+ void llama_model::load_hparams(llama_model_loader & ml) {
389
+ const gguf_context * ctx = ml.meta.get();
390
+
391
+ // get metadata as string
392
+ for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
393
+ enum gguf_type type = gguf_get_kv_type(ctx, i);
394
+ if (type == GGUF_TYPE_ARRAY) {
395
+ continue;
396
+ }
397
+ const char * name = gguf_get_key(ctx, i);
398
+ const std::string value = gguf_kv_to_str(ctx, i);
399
+ gguf_kv.emplace(name, value);
400
+ }
401
+
402
+ // get general kv
403
+ ml.get_key(LLM_KV_GENERAL_NAME, name, false);
404
+
405
+ // everything past this point is not vocab-related
406
+ if (hparams.vocab_only) {
407
+ return;
408
+ }
409
+
410
+ ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
411
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
412
+ ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
413
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
414
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
415
+
416
+ if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
417
+ ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
418
+
419
+ ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
420
+ ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
421
+
422
+ ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
423
+ ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
424
+ }
425
+
426
+ GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
427
+ GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
428
+ if (hparams.n_expert > 0) {
429
+ GGML_ASSERT(hparams.n_expert_used > 0);
430
+ } else {
431
+ GGML_ASSERT(hparams.n_expert_used == 0);
432
+ }
433
+
434
+ // zero-out the array hparams
435
+ std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
436
+ std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
437
+ std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
438
+
439
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
440
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
441
+
442
+ // n_head_kv is optional, default to n_head
443
+ hparams.n_head_kv_arr = hparams.n_head_arr;
444
+
445
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
446
+
447
+ bool rope_finetuned = false;
448
+ ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
449
+ hparams.rope_finetuned = rope_finetuned;
450
+
451
+ hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
452
+ ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
453
+
454
+ // rope_freq_base (optional)
455
+ hparams.rope_freq_base_train = 10000.0f;
456
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
457
+
458
+ std::string rope_scaling("linear");
459
+ ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
460
+ hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
461
+ GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
462
+
463
+ // rope_freq_scale (inverse of the kv) is optional
464
+ float ropescale = 0.0f;
465
+ if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
466
+ // try the old key name
467
+ ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
468
+ }
469
+ hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
470
+
471
+ ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
472
+
473
+ // non-transformer models do not have attention heads
474
+ if (hparams.n_head() > 0) {
475
+ // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
476
+ // gpt-j n_rot = rotary_dim
477
+
478
+ hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
479
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
480
+
481
+ hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
482
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
483
+
484
+ // sanity check for n_rot (optional)
485
+ hparams.n_rot = hparams.n_embd_head_k;
486
+
487
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
488
+
489
+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
490
+ if (hparams.n_rot != hparams.n_embd_head_k) {
491
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
492
+ }
493
+ }
494
+ } else {
495
+ hparams.n_rot = 0;
496
+ hparams.n_embd_head_k = 0;
497
+ hparams.n_embd_head_v = 0;
498
+ }
499
+
500
+ // for differentiating model types
501
+ uint32_t n_vocab = 0;
502
+ ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
503
+
504
+ // arch-specific KVs
505
+ switch (arch) {
506
+ case LLM_ARCH_LLAMA:
507
+ {
508
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
509
+
510
+ if (hparams.n_expert == 8) {
511
+ switch (hparams.n_layer) {
512
+ case 32: type = LLM_TYPE_8x7B; break;
513
+ case 56: type = LLM_TYPE_8x22B; break;
514
+ default: type = LLM_TYPE_UNKNOWN;
515
+ }
516
+ } else {
517
+ switch (hparams.n_layer) {
518
+ case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
519
+ case 22: type = LLM_TYPE_1B; break;
520
+ case 26: type = LLM_TYPE_3B; break;
521
+ case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
522
+ // granite uses a vocab with len 49152
523
+ case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
524
+ case 36: type = LLM_TYPE_8B; break; // granite
525
+ case 40: type = LLM_TYPE_13B; break;
526
+ case 48: type = LLM_TYPE_34B; break;
527
+ case 60: type = LLM_TYPE_30B; break;
528
+ case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
529
+ default: type = LLM_TYPE_UNKNOWN;
530
+ }
531
+ }
532
+ } break;
533
+ case LLM_ARCH_DECI:
534
+ {
535
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
536
+ switch (hparams.n_layer) {
537
+ case 32: type = LLM_TYPE_7B; break;
538
+ case 80: type = LLM_TYPE_70B; break;
539
+ default: type = LLM_TYPE_UNKNOWN;
540
+ }
541
+ } break;
542
+ case LLM_ARCH_MINICPM:
543
+ {
544
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
545
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
546
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
547
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
548
+
549
+ switch (hparams.n_layer) {
550
+ case 52: type = LLM_TYPE_1B; break;
551
+ case 40: type = LLM_TYPE_2B; break;
552
+ default: type = LLM_TYPE_UNKNOWN;
553
+ }
554
+ } break;
555
+ case LLM_ARCH_MINICPM3:
556
+ {
557
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
558
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
559
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
560
+
561
+ switch (hparams.n_layer) {
562
+ case 62: type = LLM_TYPE_4B; break;
563
+ default: type = LLM_TYPE_UNKNOWN;
564
+ }
565
+ } break;
566
+ case LLM_ARCH_GROK:
567
+ {
568
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
569
+
570
+ switch (hparams.n_layer) {
571
+ case 64: type = LLM_TYPE_314B; break;
572
+ default: type = LLM_TYPE_UNKNOWN;
573
+ }
574
+ } break;
575
+ case LLM_ARCH_FALCON:
576
+ {
577
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
578
+
579
+ switch (hparams.n_layer) {
580
+ case 32: type = LLM_TYPE_7B; break;
581
+ case 60: type = LLM_TYPE_40B; break;
582
+ default: type = LLM_TYPE_UNKNOWN;
583
+ }
584
+ } break;
585
+ case LLM_ARCH_BAICHUAN:
586
+ {
587
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
588
+ switch (hparams.n_layer) {
589
+ case 32: type = LLM_TYPE_7B; break;
590
+ case 40: type = LLM_TYPE_13B; break;
591
+ default: type = LLM_TYPE_UNKNOWN;
592
+ }
593
+
594
+ if (type == LLM_TYPE_13B) {
595
+ // TODO: become GGUF KV parameter
596
+ hparams.f_max_alibi_bias = 8.0f;
597
+ }
598
+ } break;
599
+ case LLM_ARCH_STARCODER:
600
+ {
601
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
602
+ switch (hparams.n_layer) {
603
+ case 24: type = LLM_TYPE_1B; break;
604
+ case 36: type = LLM_TYPE_3B; break;
605
+ case 42: type = LLM_TYPE_7B; break;
606
+ case 40: type = LLM_TYPE_15B; break;
607
+ default: type = LLM_TYPE_UNKNOWN;
608
+ }
609
+ } break;
610
+ case LLM_ARCH_REFACT:
611
+ {
612
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
613
+ switch (hparams.n_layer) {
614
+ case 32: type = LLM_TYPE_1B; break;
615
+ default: type = LLM_TYPE_UNKNOWN;
616
+ }
617
+
618
+ // TODO: become GGUF KV parameter
619
+ hparams.f_max_alibi_bias = 8.0f;
620
+ } break;
621
+ case LLM_ARCH_BERT:
622
+ {
623
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
624
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
625
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
626
+
627
+ switch (hparams.n_layer) {
628
+ case 3:
629
+ type = LLM_TYPE_17M; break; // bge-micro
630
+ case 6:
631
+ type = LLM_TYPE_22M; break; // MiniLM-L6
632
+ case 12:
633
+ switch (hparams.n_embd) {
634
+ case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
635
+ case 768: type = LLM_TYPE_109M; break; // bge-base
636
+ default: type = LLM_TYPE_UNKNOWN;
637
+ } break;
638
+ case 24:
639
+ type = LLM_TYPE_335M; break; // bge-large
640
+ default: type = LLM_TYPE_UNKNOWN;
641
+ }
642
+ } break;
643
+ case LLM_ARCH_JINA_BERT_V2:
644
+ {
645
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
646
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
647
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
648
+ hparams.f_max_alibi_bias = 8.0f;
649
+
650
+ switch (hparams.n_layer) {
651
+ case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
652
+ case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
653
+ default: type = LLM_TYPE_UNKNOWN;
654
+ }
655
+ } break;
656
+ case LLM_ARCH_NOMIC_BERT:
657
+ {
658
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
659
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
660
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
661
+
662
+ if (hparams.n_layer == 12 && hparams.n_embd == 768) {
663
+ type = LLM_TYPE_137M;
664
+ }
665
+ } break;
666
+ case LLM_ARCH_BLOOM:
667
+ {
668
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
669
+
670
+ switch (hparams.n_layer) {
671
+ case 24: type = LLM_TYPE_1B; break;
672
+ case 30:
673
+ switch (hparams.n_embd) {
674
+ case 2560: type = LLM_TYPE_3B; break;
675
+ case 4096: type = LLM_TYPE_7B; break;
676
+ default: type = LLM_TYPE_UNKNOWN;
677
+ } break;
678
+ default: type = LLM_TYPE_UNKNOWN;
679
+ }
680
+
681
+ // TODO: become GGUF KV parameter
682
+ hparams.f_max_alibi_bias = 8.0f;
683
+ } break;
684
+ case LLM_ARCH_MPT:
685
+ {
686
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
687
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
688
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
689
+
690
+ switch (hparams.n_layer) {
691
+ case 32: type = LLM_TYPE_7B; break;
692
+ case 48: type = LLM_TYPE_30B; break;
693
+ default: type = LLM_TYPE_UNKNOWN;
694
+ }
695
+ } break;
696
+ case LLM_ARCH_STABLELM:
697
+ {
698
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
699
+
700
+ switch (hparams.n_layer) {
701
+ case 24: type = LLM_TYPE_1B; break;
702
+ case 32: type = LLM_TYPE_3B; break;
703
+ case 40: type = LLM_TYPE_12B; break;
704
+ default: type = LLM_TYPE_UNKNOWN;
705
+ }
706
+ } break;
707
+ case LLM_ARCH_QWEN:
708
+ {
709
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
710
+
711
+ switch (hparams.n_layer) {
712
+ case 32: type = LLM_TYPE_7B; break;
713
+ case 40: type = LLM_TYPE_13B; break;
714
+ default: type = LLM_TYPE_UNKNOWN;
715
+ }
716
+ } break;
717
+ case LLM_ARCH_QWEN2VL:
718
+ {
719
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
720
+ }
721
+ // fall through
722
+ case LLM_ARCH_QWEN2:
723
+ {
724
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
725
+ switch (hparams.n_layer) {
726
+ case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
727
+ case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
728
+ case 32: type = LLM_TYPE_7B; break;
729
+ case 36: type = LLM_TYPE_3B; break;
730
+ case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
731
+ case 48: type = LLM_TYPE_14B; break;
732
+ case 64: type = LLM_TYPE_32B; break;
733
+ case 80: type = LLM_TYPE_70B; break;
734
+ default: type = LLM_TYPE_UNKNOWN;
735
+ }
736
+ } break;
737
+ case LLM_ARCH_QWEN2MOE:
738
+ {
739
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
740
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
741
+
742
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
743
+ switch (hparams.n_layer) {
744
+ case 24: type = LLM_TYPE_A2_7B; break;
745
+ case 28: type = LLM_TYPE_57B_A14B; break;
746
+ default: type = LLM_TYPE_UNKNOWN;
747
+ }
748
+ } break;
749
+ case LLM_ARCH_PHI2:
750
+ {
751
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
752
+
753
+ switch (hparams.n_layer) {
754
+ case 24: type = LLM_TYPE_1B; break;
755
+ case 32: type = LLM_TYPE_3B; break;
756
+ default: type = LLM_TYPE_UNKNOWN;
757
+ }
758
+ } break;
759
+ case LLM_ARCH_PHI3:
760
+ {
761
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
762
+
763
+ switch (hparams.n_layer) {
764
+ case 24: type = LLM_TYPE_1B; break;
765
+ case 32: type = LLM_TYPE_3B; break;
766
+ case 40: type = LLM_TYPE_14B; break;
767
+ default: type = LLM_TYPE_UNKNOWN;
768
+ }
769
+
770
+ // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
771
+ if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
772
+ // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
773
+ hparams.n_swa = 2047;
774
+ } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
775
+ // default value for Phi-3-mini-128k-instruct
776
+ hparams.n_swa = 262144;
777
+ } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
778
+ // default value for Phi-3-medium-128k-instruct
779
+ hparams.n_swa = 131072;
780
+ }
781
+ bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
782
+ if (!found_swa && hparams.n_swa == 0) {
783
+ throw std::runtime_error("invalid value for sliding_window");
784
+ }
785
+ } break;
786
+ case LLM_ARCH_PHIMOE:
787
+ {
788
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
789
+
790
+ switch (hparams.n_layer) {
791
+ case 32: type = LLM_TYPE_16x3_8B; break;
792
+ default: type = LLM_TYPE_UNKNOWN;
793
+ }
794
+ } break;
795
+ case LLM_ARCH_PLAMO:
796
+ {
797
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
798
+
799
+ switch (hparams.n_layer) {
800
+ case 40: type = LLM_TYPE_13B; break;
801
+ default: type = LLM_TYPE_UNKNOWN;
802
+ }
803
+ } break;
804
+ case LLM_ARCH_GPT2:
805
+ {
806
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
807
+ switch (hparams.n_layer) {
808
+ case 12: type = LLM_TYPE_SMALL; break;
809
+ case 24: type = LLM_TYPE_MEDIUM; break;
810
+ case 36: type = LLM_TYPE_LARGE; break;
811
+ case 48: type = LLM_TYPE_XL; break;
812
+ default: type = LLM_TYPE_UNKNOWN;
813
+ }
814
+ } break;
815
+ case LLM_ARCH_CODESHELL:
816
+ {
817
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
818
+ switch (hparams.n_layer) {
819
+ case 42: type = LLM_TYPE_7B; break;
820
+ default: type = LLM_TYPE_UNKNOWN;
821
+ }
822
+ } break;
823
+ case LLM_ARCH_ORION:
824
+ {
825
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
826
+
827
+ switch (hparams.n_layer) {
828
+ case 40: type = LLM_TYPE_14B; break;
829
+ default: type = LLM_TYPE_UNKNOWN;
830
+ }
831
+ } break;
832
+ case LLM_ARCH_INTERNLM2:
833
+ {
834
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
835
+ switch (hparams.n_layer) {
836
+ case 32: type = LLM_TYPE_7B; break;
837
+ case 48: type = LLM_TYPE_20B; break;
838
+ default: type = LLM_TYPE_UNKNOWN;
839
+ }
840
+ } break;
841
+ case LLM_ARCH_GEMMA:
842
+ {
843
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
844
+
845
+ switch (hparams.n_layer) {
846
+ case 18: type = LLM_TYPE_2B; break;
847
+ case 28: type = LLM_TYPE_7B; break;
848
+ default: type = LLM_TYPE_UNKNOWN;
849
+ }
850
+ } break;
851
+ case LLM_ARCH_GEMMA2:
852
+ {
853
+ hparams.n_swa = 4096; // default value of gemma 2
854
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
855
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
856
+ ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
857
+ ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
858
+ hparams.attn_soft_cap = true;
859
+
860
+ switch (hparams.n_layer) {
861
+ case 26: type = LLM_TYPE_2B; break;
862
+ case 42: type = LLM_TYPE_9B; break;
863
+ case 46: type = LLM_TYPE_27B; break;
864
+ default: type = LLM_TYPE_UNKNOWN;
865
+ }
866
+ } break;
867
+ case LLM_ARCH_STARCODER2:
868
+ {
869
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
870
+ switch (hparams.n_layer) {
871
+ case 30: type = LLM_TYPE_3B; break;
872
+ case 32: type = LLM_TYPE_7B; break;
873
+ case 40: type = LLM_TYPE_15B; break;
874
+ case 52: type = LLM_TYPE_20B; break; // granite
875
+ case 88: type = LLM_TYPE_34B; break; // granite
876
+ default: type = LLM_TYPE_UNKNOWN;
877
+ }
878
+ } break;
879
+ case LLM_ARCH_MAMBA:
880
+ {
881
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
882
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
883
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
884
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
885
+ ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
886
+
887
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
888
+
889
+ switch (hparams.n_layer) {
890
+ case 24:
891
+ switch (hparams.n_embd) {
892
+ case 768: type = LLM_TYPE_SMALL; break;
893
+ default: type = LLM_TYPE_UNKNOWN;
894
+ } break;
895
+ case 48:
896
+ switch (hparams.n_embd) {
897
+ case 1024: type = LLM_TYPE_MEDIUM; break;
898
+ case 1536: type = LLM_TYPE_LARGE; break;
899
+ case 2048: type = LLM_TYPE_XL; break;
900
+ default: type = LLM_TYPE_UNKNOWN;
901
+ } break;
902
+ case 64:
903
+ switch (hparams.n_embd) {
904
+ case 2560: type = LLM_TYPE_3B; break;
905
+ default: type = LLM_TYPE_UNKNOWN;
906
+ } break;
907
+ default: type = LLM_TYPE_UNKNOWN;
908
+ }
909
+ } break;
910
+ case LLM_ARCH_XVERSE:
911
+ {
912
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
913
+ switch (hparams.n_layer) {
914
+ case 32: type = LLM_TYPE_7B; break;
915
+ case 40: type = LLM_TYPE_13B; break;
916
+ case 80: type = LLM_TYPE_65B; break;
917
+ default: type = LLM_TYPE_UNKNOWN;
918
+ }
919
+ } break;
920
+ case LLM_ARCH_COMMAND_R:
921
+ {
922
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
923
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
924
+ switch (hparams.n_layer) {
925
+ case 40: type = LLM_TYPE_35B; break;
926
+ default: type = LLM_TYPE_UNKNOWN;
927
+ }
928
+ } break;
929
+ case LLM_ARCH_COHERE2:
930
+ {
931
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
932
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
933
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
934
+ switch (hparams.n_layer) {
935
+ case 32: type = LLM_TYPE_8B; break;
936
+ default: type = LLM_TYPE_UNKNOWN;
937
+ }
938
+ } break;
939
+ case LLM_ARCH_DBRX:
940
+ {
941
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
942
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
943
+
944
+ switch (hparams.n_layer) {
945
+ case 40: type = LLM_TYPE_16x12B; break;
946
+ default: type = LLM_TYPE_UNKNOWN;
947
+ }
948
+ } break;
949
+ case LLM_ARCH_OLMO:
950
+ {
951
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
952
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
953
+
954
+ switch (hparams.n_layer) {
955
+ case 22: type = LLM_TYPE_1B; break;
956
+ case 32: type = LLM_TYPE_7B; break;
957
+ case 80: type = LLM_TYPE_70B; break;
958
+ default: type = LLM_TYPE_UNKNOWN;
959
+ }
960
+ } break;
961
+ case LLM_ARCH_OLMO2:
962
+ {
963
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
964
+
965
+ switch (hparams.n_layer) {
966
+ case 16: type = LLM_TYPE_1B; break;
967
+ case 32: type = LLM_TYPE_7B; break;
968
+ case 40: type = LLM_TYPE_13B; break;
969
+ default: type = LLM_TYPE_UNKNOWN;
970
+ }
971
+ } break;
972
+ case LLM_ARCH_OLMOE:
973
+ {
974
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
975
+ switch (hparams.n_layer) {
976
+ case 16: type = LLM_TYPE_A1_7B; break;
977
+ default: type = LLM_TYPE_UNKNOWN;
978
+ }
979
+ } break;
980
+ case LLM_ARCH_OPENELM:
981
+ {
982
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
983
+
984
+ switch (hparams.n_layer) {
985
+ case 16: type = LLM_TYPE_270M; break;
986
+ case 20: type = LLM_TYPE_450M; break;
987
+ case 28: type = LLM_TYPE_1B; break;
988
+ case 36: type = LLM_TYPE_3B; break;
989
+ default: type = LLM_TYPE_UNKNOWN;
990
+ }
991
+ } break;
992
+ case LLM_ARCH_GPTNEOX:
993
+ {
994
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
995
+ ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
996
+ switch (hparams.n_layer) {
997
+ case 6:
998
+ switch (hparams.n_ff()) {
999
+ case 512: type = LLM_TYPE_14M; break;
1000
+ case 2048: type = LLM_TYPE_70M; break;
1001
+ default: type = LLM_TYPE_UNKNOWN;
1002
+ } break;
1003
+ case 12:
1004
+ switch (hparams.n_ff()) {
1005
+ case 3072: type = LLM_TYPE_160M; break;
1006
+ default: type = LLM_TYPE_UNKNOWN;
1007
+ } break;
1008
+ case 16:
1009
+ switch (hparams.n_ff()) {
1010
+ case 8192: type = LLM_TYPE_1B; break;
1011
+ default: type = LLM_TYPE_UNKNOWN;
1012
+ } break;
1013
+ case 24:
1014
+ switch (hparams.n_ff()) {
1015
+ case 4096: type = LLM_TYPE_410M; break;
1016
+ case 8192: type = LLM_TYPE_1_4B; break;
1017
+ default: type = LLM_TYPE_UNKNOWN;
1018
+ } break;
1019
+ case 32:
1020
+ switch (hparams.n_ff()) {
1021
+ case 10240: type = LLM_TYPE_2_8B; break;
1022
+ case 16384: type = LLM_TYPE_6_9B; break;
1023
+ default: type = LLM_TYPE_UNKNOWN;
1024
+ } break;
1025
+ case 36:
1026
+ switch (hparams.n_ff()) {
1027
+ case 20480: type = LLM_TYPE_12B; break;
1028
+ default: type = LLM_TYPE_UNKNOWN;
1029
+ } break;
1030
+ case 44:
1031
+ switch (hparams.n_ff()) {
1032
+ case 24576: type = LLM_TYPE_20B; break;
1033
+ default: type = LLM_TYPE_UNKNOWN;
1034
+ } break;
1035
+ default: type = LLM_TYPE_UNKNOWN;
1036
+ }
1037
+ } break;
1038
+ case LLM_ARCH_ARCTIC:
1039
+ {
1040
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1041
+
1042
+ if (hparams.n_expert == 128) {
1043
+ switch (hparams.n_layer) {
1044
+ case 35: type = LLM_TYPE_10B_128x3_66B; break;
1045
+ default: type = LLM_TYPE_UNKNOWN;
1046
+ }
1047
+ } else {
1048
+ type = LLM_TYPE_UNKNOWN;
1049
+ }
1050
+ } break;
1051
+ case LLM_ARCH_DEEPSEEK:
1052
+ {
1053
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1054
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1055
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1056
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1057
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1058
+
1059
+ switch (hparams.n_layer) {
1060
+ case 28: type = LLM_TYPE_20B; break;
1061
+ default: type = LLM_TYPE_UNKNOWN;
1062
+ }
1063
+ } break;
1064
+ case LLM_ARCH_DEEPSEEK2:
1065
+ {
1066
+ bool is_lite = (hparams.n_layer == 27);
1067
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1068
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1069
+ if (!is_lite) {
1070
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
1071
+ }
1072
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
1073
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1074
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1075
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1076
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1077
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1078
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1079
+ // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
1080
+ // that have no expert_gating_func model parameter set
1081
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
1082
+ }
1083
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
1084
+
1085
+ switch (hparams.n_layer) {
1086
+ case 27: type = LLM_TYPE_16B; break;
1087
+ case 60: type = LLM_TYPE_236B; break;
1088
+ case 61: type = LLM_TYPE_671B; break;
1089
+ default: type = LLM_TYPE_UNKNOWN;
1090
+ }
1091
+ } break;
1092
+ case LLM_ARCH_CHATGLM:
1093
+ {
1094
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1095
+ switch (hparams.n_layer) {
1096
+ case 28: type = LLM_TYPE_6B; break;
1097
+ case 40: type = LLM_TYPE_9B; break;
1098
+ default: type = LLM_TYPE_UNKNOWN;
1099
+ }
1100
+ } break;
1101
+ case LLM_ARCH_BITNET:
1102
+ {
1103
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1104
+
1105
+ switch (hparams.n_layer) {
1106
+ case 26: type = LLM_TYPE_3B; break;
1107
+ default: type = LLM_TYPE_UNKNOWN;
1108
+ }
1109
+ } break;
1110
+ case LLM_ARCH_T5:
1111
+ {
1112
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1113
+ ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
1114
+
1115
+ uint32_t dec_start_token_id;
1116
+ if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
1117
+ hparams.dec_start_token_id = dec_start_token_id;
1118
+ }
1119
+
1120
+ switch (hparams.n_layer) {
1121
+ case 6: type = LLM_TYPE_60M; break; // t5-small
1122
+ case 8: type = LLM_TYPE_80M; break; // flan-t5-small
1123
+ case 12:
1124
+ switch (hparams.n_ff()) {
1125
+ case 3072: type = LLM_TYPE_220M; break; // t5-base
1126
+ case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
1127
+ default: type = LLM_TYPE_UNKNOWN;
1128
+ } break;
1129
+ case 24:
1130
+ switch (hparams.n_ff()) {
1131
+ case 4096: type = LLM_TYPE_770M; break; // t5-large
1132
+ case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
1133
+ case 16384: type = LLM_TYPE_3B; break; // t5-3b
1134
+ case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
1135
+ case 65536: type = LLM_TYPE_11B; break; // t5-11b
1136
+ case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
1137
+ default: type = LLM_TYPE_UNKNOWN;
1138
+ } break;
1139
+ default: type = LLM_TYPE_UNKNOWN;
1140
+ }
1141
+ } break;
1142
+ case LLM_ARCH_T5ENCODER:
1143
+ {
1144
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1145
+ ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
1146
+ type = LLM_TYPE_UNKNOWN;
1147
+ } break;
1148
+ case LLM_ARCH_JAIS:
1149
+ {
1150
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1151
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
1152
+
1153
+ switch (hparams.n_layer) {
1154
+ case 24: type = LLM_TYPE_1_3B; break;
1155
+ case 40: type = LLM_TYPE_13B; break;
1156
+ /* TODO: add variants */
1157
+ default: type = LLM_TYPE_UNKNOWN;
1158
+ }
1159
+ } break;
1160
+ case LLM_ARCH_NEMOTRON:
1161
+ {
1162
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1163
+ switch (hparams.n_layer) {
1164
+ case 32: type = LLM_TYPE_4B; break;
1165
+ default: type = LLM_TYPE_UNKNOWN;
1166
+ }
1167
+ } break;
1168
+ case LLM_ARCH_EXAONE:
1169
+ {
1170
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1171
+
1172
+ switch (hparams.n_layer) {
1173
+ case 32: type = LLM_TYPE_8B; break;
1174
+ default: type = LLM_TYPE_UNKNOWN;
1175
+ }
1176
+ } break;
1177
+ case LLM_ARCH_RWKV6:
1178
+ case LLM_ARCH_RWKV6QWEN2:
1179
+ {
1180
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
1181
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
1182
+ ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
1183
+ ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
1184
+ ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
1185
+ ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
1186
+ ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
1187
+
1188
+ switch (hparams.n_layer) {
1189
+ case 24: type = LLM_TYPE_1_6B; break;
1190
+ case 32:
1191
+ switch (hparams.n_embd) {
1192
+ case 2560: type = LLM_TYPE_3B; break;
1193
+ case 4096: type = LLM_TYPE_7B; break;
1194
+ default: type = LLM_TYPE_UNKNOWN;
1195
+ } break;
1196
+ case 61: type = LLM_TYPE_14B; break;
1197
+ case 64: type = LLM_TYPE_32B; break;
1198
+ default: type = LLM_TYPE_UNKNOWN;
1199
+ }
1200
+ } break;
1201
+ case LLM_ARCH_GRANITE:
1202
+ case LLM_ARCH_GRANITE_MOE:
1203
+ {
1204
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1205
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
1206
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
1207
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
1208
+ ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
1209
+
1210
+ switch (hparams.n_layer) {
1211
+ case 32: type = LLM_TYPE_3B; break;
1212
+ case 40: type = LLM_TYPE_3B; break;
1213
+ // Add additional layer/vocab/etc checks here for other model sizes
1214
+ default: type = LLM_TYPE_UNKNOWN;
1215
+ }
1216
+ } break;
1217
+ case LLM_ARCH_CHAMELEON:
1218
+ {
1219
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1220
+ hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
1221
+ ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
1222
+
1223
+ switch (hparams.n_layer) {
1224
+ case 32: type = LLM_TYPE_7B; break;
1225
+ case 48: type = LLM_TYPE_34B; break;
1226
+ default: type = LLM_TYPE_UNKNOWN;
1227
+ }
1228
+ } break;
1229
+ case LLM_ARCH_WAVTOKENIZER_DEC:
1230
+ {
1231
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1232
+ ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
1233
+ ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
1234
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
1235
+ } break;
1236
+ default: throw std::runtime_error("unsupported model architecture");
1237
+ }
1238
+
1239
+ pimpl->n_bytes = ml.n_bytes;
1240
+
1241
+ pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
1242
+
1243
+ if (hparams.f_max_alibi_bias > 0.0f) {
1244
+ hparams.use_alibi = true;
1245
+ }
1246
+
1247
+ hparams.rope_type = llama_model_rope_type(this);
1248
+ }
1249
+
1250
+ void llama_model::load_vocab(llama_model_loader & ml) {
1251
+ const auto kv = LLM_KV(arch);
1252
+
1253
+ vocab.load(ml, kv);
1254
+ }
1255
+
1256
+ bool llama_model::load_tensors(llama_model_loader & ml) {
1257
+ const auto & split_mode = params.split_mode;
1258
+ const auto & n_gpu_layers = params.n_gpu_layers;
1259
+ const auto & use_mlock = params.use_mlock;
1260
+ const auto & tensor_split = params.tensor_split;
1261
+
1262
+ const int n_layer = hparams.n_layer;
1263
+
1264
+ const bool use_mmap_buffer = true;
1265
+
1266
+ // build a list of buffer types for the CPU and GPU devices
1267
+ pimpl->cpu_buft_list = make_cpu_buft_list(devices);
1268
+ for (auto * dev : devices) {
1269
+ buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
1270
+ // add CPU buffer types as a fallback
1271
+ buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
1272
+ pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
1273
+ }
1274
+
1275
+ // calculate the split points
1276
+ bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
1277
+ std::vector<float> splits(n_devices());
1278
+ if (all_zero) {
1279
+ // default split, by free memory
1280
+ for (size_t i = 0; i < n_devices(); ++i) {
1281
+ ggml_backend_dev_t dev = devices[i];
1282
+ size_t total;
1283
+ size_t free;
1284
+ ggml_backend_dev_memory(dev, &free, &total);
1285
+ splits[i] = free;
1286
+ }
1287
+ } else {
1288
+ std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
1289
+ }
1290
+
1291
+ // sum and normalize the splits to get the split points
1292
+ float split_sum = 0.0f;
1293
+ for (size_t i = 0; i < n_devices(); ++i) {
1294
+ split_sum += splits[i];
1295
+ splits[i] = split_sum;
1296
+ }
1297
+ for (size_t i = 0; i < n_devices(); ++i) {
1298
+ splits[i] /= split_sum;
1299
+ }
1300
+
1301
+ ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
1302
+ const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
1303
+ const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
1304
+ auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
1305
+ if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
1306
+ return {cpu_dev, &pimpl->cpu_buft_list};
1307
+ }
1308
+ const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
1309
+ auto * dev = devices.at(layer_gpu);
1310
+ return {dev, &pimpl->gpu_buft_list.at(dev)};
1311
+ };
1312
+
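To make the device-split logic above concrete, here is a minimal standalone sketch of how the cumulative, normalized split points map a layer index to a device. It is editorial illustration only, not part of the package diff; the free-memory values (16 GB and 8 GB) and the layer count (32) are hypothetical, and only the standard library is used.

// Sketch of the split-point mapping used by load_tensors above.
// Hypothetical inputs: two devices with 16 GB and 8 GB free, 32 offloaded layers.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> splits = {16.0f, 8.0f};   // free memory per device (GB)

    // prefix-sum, then normalize: {16, 24} -> {0.667, 1.0}
    float sum = 0.0f;
    for (auto & s : splits) { sum += s; s = sum; }
    for (auto & s : splits) { s /= sum; }

    const int act_gpu_layers = 32;
    for (int il = 0; il < act_gpu_layers; ++il) {
        // first device whose cumulative share exceeds the layer's fractional position
        const int dev = (int) (std::upper_bound(splits.begin(), splits.end(),
                                                float(il) / act_gpu_layers) - splits.begin());
        std::printf("layer %2d -> device %d\n", il, dev);
    }
    return 0;
}

With these numbers, layers 0-21 land on device 0 and layers 22-31 on device 1, i.e. roughly the 2:1 free-memory ratio, which is the same proportional assignment the code above performs before handing the result to get_layer_buft_list.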
1313
+ // assign the input layer
1314
+ // there is very little benefit to offloading the input layer, so always keep it on the CPU
1315
+ pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
1316
+
1317
+ // assign the repeating layers to the devices according to the splits
1318
+ pimpl->dev_layer.resize(n_layer);
1319
+ for (int il = 0; il < n_layer; ++il) {
1320
+ pimpl->dev_layer[il] = get_layer_buft_list(il);
1321
+ }
1322
+
1323
+ // assign the output layer
1324
+ pimpl->dev_output = get_layer_buft_list(n_layer);
1325
+
1326
+ // one ggml context per buffer type
1327
+ int max_n_tensors = ml.n_tensors;
1328
+ max_n_tensors += 1; // duplicated output tensor
1329
+ max_n_tensors += n_layer*2; // duplicated rope freq tensors
1330
+ const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
1331
+
1332
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
1333
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
1334
+ auto it = ctx_map.find(buft);
1335
+ if (it == ctx_map.end()) {
1336
+ ggml_init_params params = {
1337
+ /*.mem_size =*/ ctx_size,
1338
+ /*.mem_buffer =*/ NULL,
1339
+ /*.no_alloc =*/ true,
1340
+ };
1341
+
1342
+ ggml_context * ctx = ggml_init(params);
1343
+ if (!ctx) {
1344
+ throw std::runtime_error(format("failed to create ggml context"));
1345
+ }
1346
+
1347
+ ctx_map[buft] = ctx;
1348
+ pimpl->ctxs.emplace_back(ctx);
1349
+
1350
+ return ctx;
1351
+ }
1352
+ return it->second;
1353
+ };
1354
+
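The ctx_for_buft lambda above is a straightforward memoization: one ggml context is created lazily per buffer type and reused on every later request. A minimal sketch of the same idiom follows; it is an editorial illustration with stand-in types (context, buft_t, get_or_create are placeholders), not the real ggml API.

// Sketch of the lazily-created, per-key context cache used by ctx_for_buft.
#include <map>
#include <memory>

struct context { /* stand-in for ggml_context */ };
using buft_t = int; // stand-in for ggml_backend_buffer_type_t

context * get_or_create(std::map<buft_t, std::unique_ptr<context>> & cache, buft_t buft) {
    auto it = cache.find(buft);
    if (it == cache.end()) {
        // first request for this buffer type: build the context once and cache it
        it = cache.emplace(buft, std::make_unique<context>()).first;
    }
    return it->second.get();
}

In the sketch the map owns the contexts via unique_ptr; in the diff the raw pointers live in ctx_map while ownership is kept in pimpl->ctxs, so the contexts outlive the lambda and are released with the model.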
1355
+ const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
1356
+ const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
1357
+
1358
+ // create tensors for the weights
1359
+ {
1360
+ // note: cast to int64_t since we will use these for the tensor dimensions
1361
+ const int64_t n_head = hparams.n_head();
1362
+ const int64_t n_head_kv = hparams.n_head_kv();
1363
+ const int64_t n_embd = hparams.n_embd;
1364
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
1365
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
1366
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
1367
+ const int64_t n_embd_head_v = hparams.n_embd_head_v;
1368
+ const int64_t n_ff = hparams.n_ff();
1369
+ const int64_t n_embd_gqa = n_embd_v_gqa;
1370
+ const int64_t n_vocab = vocab.n_tokens();
1371
+ const int64_t n_token_types = vocab.n_token_types();
1372
+ const int64_t n_rot = hparams.n_rot;
1373
+ const int64_t n_expert = hparams.n_expert;
1374
+ const int64_t n_expert_used = hparams.n_expert_used;
1375
+ const int64_t n_ctx_train = hparams.n_ctx_train;
1376
+
1377
+ if (n_expert > 0 && hparams.n_expert_used == 0) {
1378
+ throw std::runtime_error("model has expert layers but no expert layers are used");
1379
+ }
1380
+
1381
+ int n_moved_tensors = 0;
1382
+ ggml_tensor * first_moved_tensor = nullptr;
1383
+ ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
1384
+ ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
1385
+
1386
+ auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
1387
+ ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
1388
+
1389
+ if (!t_meta) {
1390
+ if (flags & TENSOR_NOT_REQUIRED) {
1391
+ return nullptr;
1392
+ }
1393
+ throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
1394
+ }
1395
+
1396
+ // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
1397
+ // the tensor is duplicated
1398
+ // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
1399
+ llm_tensor tn_tensor = tn.tensor;
1400
+ if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
1401
+ tn_tensor = LLM_TENSOR_OUTPUT;
1402
+ }
1403
+
1404
+ llm_tensor_info info;
1405
+ try {
1406
+ info = llm_tensor_info_for(tn_tensor);
1407
+ } catch (const std::out_of_range & e) {
1408
+ throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
1409
+ }
1410
+
1411
+ // tensors with "bias" suffix are always used with GGML_OP_ADD
1412
+ ggml_op op;
1413
+ bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
1414
+ if (bias) {
1415
+ op = GGML_OP_ADD;
1416
+ } else {
1417
+ op = info.op;
1418
+ }
1419
+
1420
+ // sanity checks
1421
+ if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
1422
+ if (tn.bid != -1) {
1423
+ GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
1424
+ }
1425
+ } else {
1426
+ if (tn.bid == -1) {
1427
+ GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
1428
+ }
1429
+ }
1430
+
1431
+ // select the buffer type for this tensor
1432
+ buft_list_t * buft_list;
1433
+ switch (info.layer) {
1434
+ case LLM_TENSOR_LAYER_INPUT:
1435
+ buft_list = pimpl->dev_input.buft_list;
1436
+ break;
1437
+ case LLM_TENSOR_LAYER_OUTPUT:
1438
+ buft_list = pimpl->dev_output.buft_list;
1439
+ break;
1440
+ case LLM_TENSOR_LAYER_REPEATING:
1441
+ buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
1442
+ break;
1443
+ default:
1444
+ GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
1445
+ }
1446
+
1447
+ ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
1448
+ if (!buft) {
1449
+ throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
1450
+ }
1451
+
1452
+ // avoid using a host buffer when using mmap
1453
+ auto * buft_dev = ggml_backend_buft_get_device(buft);
1454
+ if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
1455
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
1456
+ buft = ggml_backend_dev_buffer_type(cpu_dev);
1457
+ }
1458
+
1459
+ if (buft != buft_list->front().second) {
1460
+ n_moved_tensors++;
1461
+ if (!first_moved_tensor) {
1462
+ first_moved_tensor = t_meta;
1463
+ first_moved_from_buft = buft_list->front().second;
1464
+ first_moved_to_buft = buft;
1465
+ }
1466
+ }
1467
+
1468
+ ggml_context * ctx = ctx_for_buft(buft);
1469
+
1470
+ // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
1471
+ if (flags & TENSOR_DUPLICATED) {
1472
+ ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
1473
+ if (t) {
1474
+ return t;
1475
+ }
1476
+ }
1477
+ return ml.create_tensor(ctx, tn, ne, flags);
1478
+ };
1479
+
1480
+ layers.resize(n_layer);
1481
+
1482
+ // TODO: move to a separate function
1483
+ const auto tn = LLM_TN(arch);
1484
+ switch (arch) {
1485
+ case LLM_ARCH_LLAMA:
1486
+ case LLM_ARCH_REFACT:
1487
+ case LLM_ARCH_MINICPM:
1488
+ case LLM_ARCH_GRANITE:
1489
+ case LLM_ARCH_GRANITE_MOE:
1490
+ {
1491
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1492
+
1493
+ // output
1494
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1495
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
1496
+
1497
+ // if output is NULL, init from the input tok embed
1498
+ if (output == NULL) {
1499
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
1500
+ }
1501
+
1502
+ for (int i = 0; i < n_layer; ++i) {
1503
+ auto & layer = layers[i];
1504
+
1505
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1506
+
1507
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
1508
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
1509
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
1510
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
1511
+
1512
+ // optional bias tensors
1513
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1514
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
1515
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
1516
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1517
+
1518
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1519
+
1520
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
1521
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
1522
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
1523
+ }
1524
+ else {
1525
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
1526
+ }
1527
+
1528
+ if (n_expert == 0) {
1529
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1530
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1531
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1532
+
1533
+ // optional MLP bias
1534
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
1535
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1536
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
1537
+ } else {
1538
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
1539
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
1540
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
1541
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
1542
+ }
1543
+ }
1544
+ } break;
1545
+ case LLM_ARCH_DECI:
1546
+ {
1547
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1548
+
1549
+ // output
1550
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1551
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
1552
+
1553
+ // if output is NULL, init from the input tok embed
1554
+ if (output == NULL) {
1555
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
1556
+ }
1557
+
1558
+ for (int i = 0; i < n_layer; ++i) {
1559
+ auto & layer = layers[i];
1560
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
1561
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
1562
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
1563
+ const int64_t n_ff = hparams.n_ff(i);
1564
+ const int64_t n_head = hparams.n_head(i);
1565
+ const int64_t n_head_kv = hparams.n_head_kv(i);
1566
+
1567
+ if (n_head_kv == 0 && n_head > 0) {
1568
+ // linear attention for DeciLMCausalModel
1569
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1570
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1571
+ }
1572
+ else if (n_head_kv > 0) {
1573
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1574
+
1575
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
1576
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
1577
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
1578
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
1579
+ }
1580
+
1581
+ // optional bias tensors
1582
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1583
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
1584
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
1585
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1586
+
1587
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1588
+
1589
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
1590
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
1591
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
1592
+ }
1593
+ else {
1594
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
1595
+ }
1596
+
1597
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1598
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1599
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1600
+
1601
+ // optional MLP bias
1602
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
1603
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1604
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
1605
+ }
1606
+ } break;
1607
+ case LLM_ARCH_MINICPM3:
1608
+ {
1609
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
1610
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
1611
+
1612
+ const int64_t q_lora_rank = hparams.n_lora_q;
1613
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
1614
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1615
+
1616
+ // output
1617
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1618
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
1619
+
1620
+ // if output is NULL, init from the input tok embed
1621
+ if (output == NULL) {
1622
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
1623
+ }
1624
+
1625
+ for (int i = 0; i < n_layer; ++i) {
1626
+ auto & layer = layers[i];
1627
+
1628
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1629
+ layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
1630
+
1631
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
1632
+
1633
+ layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
1634
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
1635
+
1636
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
1637
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
1638
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
1639
+
1640
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1641
+
1642
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1643
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1644
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1645
+
1646
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
1647
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
1648
+ }
1649
+ } break;
1650
+ case LLM_ARCH_GROK:
1651
+ {
1652
+ if (n_expert == 0) {
1653
+ throw std::runtime_error("Grok model cannot have zero experts");
1654
+ }
1655
+
1656
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1657
+
1658
+ // output
1659
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1660
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
1661
+
1662
+ // if output is NULL, init from the input tok embed
1663
+ if (output == NULL) {
1664
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
1665
+ }
1666
+
1667
+ for (int i = 0; i < n_layer; ++i) {
1668
+ auto & layer = layers[i];
1669
+
1670
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1671
+
1672
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1673
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1674
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1675
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1676
+
1677
+ layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
1678
+
1679
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1680
+
1681
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
1682
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
1683
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
1684
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
1685
+
1686
+ layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
1687
+ }
1688
+ } break;
1689
+ case LLM_ARCH_DBRX:
1690
+ {
1691
+ if (n_expert == 0) {
1692
+ throw std::runtime_error("DBRX model cannot have zero experts");
1693
+ }
1694
+
1695
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1696
+
1697
+ // output
1698
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1699
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1700
+
1701
+ for (int i = 0; i < n_layer; ++i) {
1702
+ auto & layer = layers[i];
1703
+
1704
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1705
+
1706
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
1707
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1708
+
1709
+ layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
1710
+
1711
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
1712
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
1713
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
1714
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
1715
+ }
1716
+ } break;
1717
+ case LLM_ARCH_BAICHUAN:
1718
+ {
1719
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1720
+ {
1721
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1722
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1723
+ }
1724
+
1725
+ for (int i = 0; i < n_layer; ++i) {
1726
+ auto & layer = layers[i];
1727
+
1728
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1729
+
1730
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1731
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1732
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1733
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1734
+
1735
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1736
+
1737
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1738
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1739
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1740
+ }
1741
+ } break;
1742
+ case LLM_ARCH_FALCON:
1743
+ {
1744
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1745
+
1746
+ // output
1747
+ {
1748
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1749
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
1750
+
1751
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
1752
+ if (!output) {
1753
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
1754
+ }
1755
+ }
1756
+
1757
+ for (int i = 0; i < n_layer; ++i) {
1758
+ auto & layer = layers[i];
1759
+
1760
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1761
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
1762
+
1763
+ layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
1764
+ layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1765
+
1766
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
1767
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1768
+
1769
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1770
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1771
+ }
1772
+ } break;
1773
+ case LLM_ARCH_STARCODER:
1774
+ {
1775
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1776
+ pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
1777
+
1778
+ // output
1779
+ {
1780
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1781
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
1782
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
1783
+ if (!output) {
1784
+ // needs to be on GPU
1785
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
1786
+ }
1787
+
1788
+ }
1789
+
1790
+ for (int i = 0; i < n_layer; ++i) {
1791
+ auto & layer = layers[i];
1792
+
1793
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1794
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
1795
+
1796
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
1797
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
1798
+
1799
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1800
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
1801
+
1802
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1803
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
1804
+
1805
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
1806
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
1807
+
1808
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1809
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
1810
+ }
1811
+ } break;
1812
+ case LLM_ARCH_BERT:
1813
+ case LLM_ARCH_NOMIC_BERT:
1814
+ {
1815
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1816
+ type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
1817
+
1818
+ if (arch == LLM_ARCH_BERT) {
1819
+ pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
1820
+
1821
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
1822
+ cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
1823
+
1824
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
1825
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, TENSOR_NOT_REQUIRED);
1826
+ }
1827
+
1828
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
1829
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
1830
+
1831
+ for (int i = 0; i < n_layer; ++i) {
1832
+ auto & layer = layers[i];
1833
+
1834
+ if (arch == LLM_ARCH_BERT) {
1835
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1836
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
1837
+
1838
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1839
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
1840
+
1841
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1842
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
1843
+ } else {
1844
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
1845
+ }
1846
+
1847
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1848
+
1849
+ layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
1850
+ layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
1851
+
1852
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1853
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
1854
+
1855
+ if (arch == LLM_ARCH_BERT) {
1856
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
1857
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
1858
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
1859
+ } else {
1860
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1861
+ }
1862
+
1863
+ layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
1864
+ layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
1865
+ }
1866
+ } break;
1867
+ case LLM_ARCH_JINA_BERT_V2:
1868
+ {
1869
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
1870
+ type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
1871
+
1872
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
1873
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); //LayerNorm bias
1874
+
1875
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
1876
+ cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
1877
+ for (int i = 0; i < n_layer; ++i) {
1878
+ auto & layer = layers[i]; // JinaBertLayer
1879
+
1880
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1881
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
1882
+
1883
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
1884
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1885
+
1886
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1887
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
1888
+
1889
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
1890
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1891
+
1892
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1893
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
1894
+
1895
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
1896
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
1897
+
1898
+ layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
1899
+ layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
1900
+
1901
+ layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
1902
+ layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1903
+
1904
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1905
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1906
+
1907
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
1908
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
1909
+
1910
+ layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
1911
+ layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
1912
+ }
1913
+ } break;
1914
+ case LLM_ARCH_BLOOM:
1915
+ {
1916
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1917
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
1918
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
1919
+
1920
+ // output
1921
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1922
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
1923
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1924
+
1925
+ for (int i = 0; i < n_layer; ++i) {
1926
+ auto & layer = layers[i];
1927
+
1928
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1929
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
1930
+
1931
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
1932
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
1933
+
1934
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1935
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
1936
+
1937
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1938
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
1939
+
1940
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
1941
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
1942
+
1943
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1944
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
1945
+ }
1946
+ } break;
1947
+ case LLM_ARCH_MPT:
1948
+ {
1949
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1950
+ pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
1951
+
1952
+ // output
1953
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1954
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
1955
+
1956
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
1957
+ if (!output) {
1958
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
1959
+ }
1960
+
1961
+ for (int i = 0; i < n_layer; ++i) {
1962
+ auto & layer = layers[i];
1963
+
1964
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1965
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1966
+
1967
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
1968
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
1969
+
1970
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1971
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1972
+
1973
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1974
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1975
+
1976
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
1977
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1978
+
1979
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1980
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
1981
+
1982
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
1983
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1984
+
1985
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
1986
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
1987
+
1988
+ // AWQ ScaleActivation layer
1989
+ layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
1990
+ }
1991
+ } break;
1992
+ case LLM_ARCH_STABLELM:
1993
+ {
1994
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1995
+
1996
+ // output
1997
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
1998
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1999
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2000
+
2001
+ for (int i = 0; i < n_layer; ++i) {
2002
+ auto & layer = layers[i];
2003
+
2004
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2005
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2006
+
2007
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2008
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2009
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2010
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2011
+
2012
+ // optional bias tensors, present in Stable LM 2 1.6B
2013
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2014
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2015
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
2016
+
2017
+ // optional q and k layernorms, present in StableLM 2 12B
2018
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
2019
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
2020
+
2021
+ // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
2022
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
2023
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2024
+
2025
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2026
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2027
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2028
+ }
2029
+ } break;
2030
+ case LLM_ARCH_QWEN:
2031
+ {
2032
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2033
+
2034
+ // output
2035
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2036
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2037
+
2038
+ for (int i = 0; i < n_layer; ++i) {
2039
+ auto & layer = layers[i];
2040
+
2041
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2042
+
2043
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
2044
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
2045
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2046
+
2047
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2048
+
2049
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
2050
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
2051
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
2052
+ }
2053
+ } break;
2054
+ case LLM_ARCH_QWEN2:
2055
+ case LLM_ARCH_QWEN2VL:
2056
+ {
2057
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2058
+
2059
+ // output
2060
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2061
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2062
+ // if output is NULL, init from the input tok embed
2063
+ if (output == NULL) {
2064
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2065
+ }
2066
+
2067
+ for (int i = 0; i < n_layer; ++i) {
2068
+ auto & layer = layers[i];
2069
+
2070
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2071
+
2072
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2073
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2074
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2075
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2076
+
2077
+ // optional bias tensors
2078
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
2079
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
2080
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
2081
+
2082
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2083
+
2084
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2085
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2086
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2087
+ }
2088
+ } break;
2089
+ case LLM_ARCH_QWEN2MOE:
2090
+ {
2091
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2092
+
2093
+ // output
2094
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2095
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2096
+
2097
+ for (int i = 0; i < n_layer; ++i) {
2098
+ auto & layer = layers[i];
2099
+
2100
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2101
+
2102
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2103
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2104
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2105
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2106
+
2107
+ // optional bias tensors
2108
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
2109
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
2110
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
2111
+
2112
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2113
+
2114
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2115
+
2116
+ if (n_expert == 0) {
2117
+ throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
2118
+ }
2119
+ if (n_expert_used == 0) {
2120
+ throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
2121
+ }
2122
+
2123
+ // MoE branch
2124
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
2125
+
2126
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
2127
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
2128
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
2129
+
2130
+ // Shared expert branch
2131
+ const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
2132
+
2133
+ layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
2134
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
2135
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
2136
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
2137
+ }
2138
+ } break;
2139
+ case LLM_ARCH_PHI2:
2140
+ {
2141
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2142
+
2143
+ // output
2144
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2145
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2146
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2147
+ output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
2148
+
2149
+ for (int i = 0; i < n_layer; ++i) {
2150
+ auto & layer = layers[i];
2151
+
2152
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2153
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2154
+
2155
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
2156
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
2157
+
2158
+ if (layer.wqkv == nullptr) {
2159
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2160
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
2161
+
2162
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2163
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
2164
+
2165
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2166
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
2167
+ }
2168
+
2169
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2170
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
2171
+
2172
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2173
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
2174
+
2175
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2176
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
2177
+ }
2178
+ } break;
2179
+ case LLM_ARCH_PHI3:
2180
+ {
2181
+ const int64_t n_embd_head = n_embd / n_head;
2182
+
2183
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
2184
+
2185
+ // output
2186
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
2187
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
2188
+
2189
+ for (int i = 0; i < n_layer; ++i) {
2190
+ auto & layer = layers[i];
2191
+
2192
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
2193
+
2194
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
2195
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
2196
+
2197
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
2198
+
2199
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
2200
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
2201
+
2202
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2203
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2204
+ }
2205
+ } break;
2206
+ case LLM_ARCH_PLAMO:
2207
+ {
2208
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2209
+
2210
+ // output
2211
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2212
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2213
+
2214
+ for (int i = 0; i < n_layer; ++i) {
2215
+ auto & layer = layers[i];
2216
+
2217
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2218
+
2219
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2220
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2221
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2222
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2223
+
2224
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2225
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2226
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2227
+ }
2228
+ } break;
2229
+ case LLM_ARCH_GPT2:
2230
+ {
2231
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2232
+ pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
2233
+
2234
+ // output
2235
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2236
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2237
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2238
+
2239
+ for (int i = 0; i < n_layer; ++i) {
2240
+ auto & layer = layers[i];
2241
+
2242
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2243
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2244
+
2245
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2246
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
2247
+
2248
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2249
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
2250
+
2251
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2252
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
2253
+
2254
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2255
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
2256
+
2257
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2258
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
2259
+ }
2260
+ } break;
2261
+ case LLM_ARCH_CODESHELL:
2262
+ {
2263
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2264
+
2265
+ // output
2266
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2267
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2268
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2269
+
2270
+ for (int i = 0; i < n_layer; ++i) {
2271
+ auto & layer = layers[i];
2272
+
2273
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2274
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2275
+
2276
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2277
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
2278
+
2279
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2280
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
2281
+
2282
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2283
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
2284
+
2285
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2286
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
2287
+
2288
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2289
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
2290
+ }
2291
+ } break;
2292
+ case LLM_ARCH_ORION:
2293
+ {
2294
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2295
+
2296
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2297
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2298
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2299
+
2300
+ for (int i = 0; i < n_layer; ++i) {
2301
+ auto & layer = layers[i];
2302
+
2303
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2304
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2305
+
2306
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2307
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2308
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2309
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2310
+
2311
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2312
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
2313
+
2314
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2315
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2316
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2317
+ }
2318
+ } break;
2319
+ case LLM_ARCH_INTERNLM2:
2320
+ {
2321
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2322
+
2323
+ // output
2324
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2325
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2326
+
2327
+ for (int i = 0; i < n_layer; ++i) {
2328
+ auto & layer = layers[i];
2329
+
2330
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2331
+ // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2332
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2333
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2334
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2335
+
2336
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2337
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2338
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2339
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2340
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2341
+ }
2342
+ } break;
2343
+ case LLM_ARCH_GEMMA:
2344
+ {
2345
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2346
+
2347
+ // output
2348
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2349
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
2350
+
2351
+ for (int i = 0; i < n_layer; ++i) {
2352
+ auto & layer = layers[i];
2353
+
2354
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2355
+
2356
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2357
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2358
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
2359
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2360
+
2361
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2362
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2363
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2364
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2365
+ }
2366
+ } break;
2367
+ case LLM_ARCH_GEMMA2:
2368
+ {
2369
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2370
+
2371
+ // output
2372
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2373
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
2374
+
2375
+ for (int i = 0; i < n_layer; ++i) {
2376
+ auto & layer = layers[i];
2377
+
2378
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2379
+
2380
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2381
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2382
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
2383
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2384
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
2385
+
2386
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2387
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2388
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2389
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2390
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
2391
+ }
2392
+ } break;
2393
+ case LLM_ARCH_STARCODER2:
2394
+ {
2395
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2396
+
2397
+ // output
2398
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2399
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2400
+
2401
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2402
+ // if output is NULL, init from the input tok embed
2403
+ if (output == NULL) {
2404
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2405
+ }
2406
+
2407
+ for (int i = 0; i < n_layer; ++i) {
2408
+ auto & layer = layers[i];
2409
+
2410
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2411
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2412
+
2413
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2414
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2415
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2416
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2417
+
2418
+ // optional bias tensors
2419
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
2420
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
2421
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
2422
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
2423
+
2424
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2425
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
2426
+
2427
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2428
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2429
+
2430
+ // optional bias tensors
2431
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
2432
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
2433
+ }
2434
+ } break;
2435
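Several of the cases above and below (StarCoder2, Mamba, OLMo, T5, Chameleon, ...) request the output head with TENSOR_NOT_REQUIRED and, when the tensor is absent from the GGUF, fall back to duplicating the token embedding as a tied LM head. A minimal sketch of that optional-with-fallback idiom; get_tensor here is a hypothetical stand-in for the loader's lookup, not the real API:

#include <cstdio>
#include <string>
#include <unordered_map>

struct tensor { std::string name; };

// stand-in for the model's tensor lookup: returns nullptr when the name is missing
static tensor * get_tensor(std::unordered_map<std::string, tensor> & m, const std::string & name) {
    auto it = m.find(name);
    return it == m.end() ? nullptr : &it->second;
}

int main() {
    std::unordered_map<std::string, tensor> model = {
        {"token_embd.weight", {"token_embd.weight"}},
        // note: no "output.weight" entry -> the weights are tied
    };

    tensor * output = get_tensor(model, "output.weight");      // optional
    if (output == nullptr) {
        // fall back to the input embedding (tied LM head)
        output = get_tensor(model, "token_embd.weight");
    }
    std::printf("LM head uses: %s\n", output->name.c_str());
    return 0;
}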
+ case LLM_ARCH_MAMBA:
2436
+ {
2437
+ const int64_t d_conv = hparams.ssm_d_conv;
2438
+ const int64_t d_inner = hparams.ssm_d_inner;
2439
+ const int64_t d_state = hparams.ssm_d_state;
2440
+ const int64_t dt_rank = hparams.ssm_dt_rank;
2441
+
2442
+ // only an expansion factor of 2 is supported for now
2443
+ if (2 * n_embd != d_inner) {
2444
+ throw std::runtime_error("only an expansion factor of 2 is supported for now");
2445
+ }
2446
+
2447
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2448
+
2449
+ // output
2450
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2451
+
2452
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2453
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
2454
+ if (output == NULL) {
2455
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2456
+ }
2457
+
2458
+ for (int i = 0; i < n_layer; ++i) {
2459
+ auto & layer = layers[i];
2460
+
2461
+ // norm
2462
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2463
+
2464
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
2465
+
2466
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
2467
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
2468
+
2469
+ layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
2470
+
2471
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
2472
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
2473
+
2474
+ // no "weight" suffix for these
2475
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
2476
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
2477
+
2478
+ // out_proj
2479
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
2480
+ }
2481
+ } break;
2482
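The Mamba case takes all of its shapes from the SSM hyperparameters and rejects anything but an expansion factor of 2. A small sketch of how those sizes relate for a typical checkpoint; the concrete numbers (and the ceil(n_embd/16) default for dt_rank, which is a common Mamba convention) are illustrative assumptions, not values read from a specific model:

#include <cassert>
#include <cstdint>

int main() {
    const int64_t n_embd  = 768;                 // model width
    const int64_t d_inner = 2 * n_embd;          // expansion factor of 2 (the only case supported above)
    const int64_t d_conv  = 4;                   // depthwise conv kernel width
    const int64_t d_state = 16;                  // SSM state size per channel
    const int64_t dt_rank = (n_embd + 15) / 16;  // common Mamba default: ceil(n_embd/16)

    // shapes requested by the loader above:
    //   ssm_in:     {n_embd, 2*d_inner}            -> produces the x and z branches
    //   ssm_conv1d: {d_conv, d_inner}
    //   ssm_x:      {d_inner, dt_rank + 2*d_state} -> dt, B and C projections
    //   ssm_dt:     {dt_rank, d_inner}
    //   ssm_out:    {d_inner, n_embd}
    assert(d_inner == 1536 && dt_rank == 48);
    (void)d_conv; (void)d_state;
    return 0;
}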
+ case LLM_ARCH_XVERSE:
2483
+ {
2484
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2485
+
2486
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2487
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2488
+
2489
+ for (int i = 0; i < n_layer; ++i) {
2490
+ auto & layer = layers[i];
2491
+
2492
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2493
+
2494
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2495
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2496
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2497
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2498
+
2499
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2500
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2501
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2502
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2503
+ }
2504
+ } break;
2505
+ case LLM_ARCH_COMMAND_R:
2506
+ {
2507
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2508
+
2509
+ // output
2510
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2511
+ // init output from the input tok embed
2512
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2513
+
2514
+ for (int i = 0; i < n_layer; ++i) {
2515
+ auto & layer = layers[i];
2516
+
2517
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2518
+
2519
+ if (n_layer >= 64){
2520
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
2521
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
2522
+ }
2523
+
2524
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2525
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2526
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2527
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2528
+
2529
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2530
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2531
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2532
+ }
2533
+ } break;
2534
+ case LLM_ARCH_COHERE2:
2535
+ {
2536
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
2537
+
2538
+ // output
2539
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
2540
+ // init output from the input tok embed
2541
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
2542
+ TENSOR_DUPLICATED);
2543
+
2544
+ for (int i = 0; i < n_layer; ++i) {
2545
+ auto & layer = layers[i];
2546
+
2547
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
2548
+
2549
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
2550
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
2551
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
2552
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
2553
+
2554
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
2555
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
2556
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
2557
+ }
2558
+ }
2559
+ break;
2560
+ case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
2561
+ {
2562
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2563
+
2564
+ // output
2565
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2566
+ // if output is NULL, init from the input tok embed
2567
+ if (output == NULL) {
2568
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2569
+ }
2570
+
2571
+ for (int i = 0; i < n_layer; ++i) {
2572
+ auto & layer = layers[i];
2573
+
2574
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2575
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2576
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2577
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2578
+
2579
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2580
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2581
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2582
+ }
2583
+ } break;
2584
+ case LLM_ARCH_OLMO2:
2585
+ {
2586
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2587
+
2588
+ // output
2589
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2590
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2591
+
2592
+ for (int i = 0; i < n_layer; ++i) {
2593
+ auto & layer = layers[i];
2594
+
2595
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2596
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2597
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2598
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2599
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
2600
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
2601
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
2602
+
2603
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2604
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2605
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2606
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
2607
+ }
2608
+ } break;
2609
+ case LLM_ARCH_OLMOE:
2610
+ {
2611
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2612
+
2613
+ // output
2614
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2615
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2616
+
2617
+ for (int i = 0; i < n_layer; ++i) {
2618
+ auto & layer = layers[i];
2619
+
2620
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2621
+
2622
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2623
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2624
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2625
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2626
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
2627
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
2628
+
2629
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2630
+
2631
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2632
+
2633
+ if (n_expert == 0) {
2634
+ throw std::runtime_error("n_expert must be > 0");
2635
+ }
2636
+ if (n_expert_used == 0) {
2637
+ throw std::runtime_error("n_expert_used must be > 0");
2638
+ }
2639
+
2640
+ // MoE branch
2641
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
2642
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
2643
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
2644
+ }
2645
+ } break;
2646
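In the mixture-of-experts cases (OLMoE here, and the Arctic and DeepSeek branches nearby) each expert's FFN weights are stacked along a third dimension, e.g. ffn_up_exps with shape {n_embd, n_ff, n_expert}, and a small router tensor ffn_gate_inp of shape {n_embd, n_expert} decides which expert slices a token uses. A minimal sketch of the top-k routing idea with plain containers, independent of ggml (the logits are made up):

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const int n_expert      = 8;
    const int n_expert_used = 2;  // top-k experts per token

    // hypothetical router logits for one token: x * ffn_gate_inp
    std::vector<float> logits = {0.1f, 2.3f, -0.5f, 1.7f, 0.0f, -1.2f, 0.9f, 0.4f};

    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return logits[a] > logits[b]; });

    // the selected indices pick 2D slices out of the stacked 3D expert tensors
    for (int k = 0; k < n_expert_used; ++k) {
        std::printf("token routed to expert %d (logit %.2f)\n", idx[k], logits[idx[k]]);
    }
    return 0;
}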
+ case LLM_ARCH_OPENELM:
2647
+ {
2648
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2649
+
2650
+ // output
2651
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2652
+ // init output from the input tok embed
2653
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2654
+
2655
+ for (int i = 0; i < n_layer; ++i) {
2656
+ const int64_t n_head = hparams.n_head(i);
2657
+ const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
2658
+ const int64_t n_ff = hparams.n_ff(i);
2659
+
2660
+ auto & layer = layers[i];
2661
+
2662
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2663
+
2664
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
2665
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
2666
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
2667
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
2668
+
2669
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2670
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2671
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2672
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2673
+ }
2674
+ } break;
2675
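OpenELM is the one case above where the head count and FFN width are read per layer (hparams.n_head(i), hparams.n_ff(i)) rather than once for the whole model, so the fused QKV width n_head_qkv*n_embd_head_k differs from layer to layer. A small sketch of that layer-wise sizing; the per-layer schedules below are made-up values, not taken from an actual OpenELM checkpoint:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t n_embd_head_k = 64;

    // hypothetical per-layer schedules (OpenELM scales these across depth)
    std::vector<int64_t> n_head_arr    = {12, 16, 20};
    std::vector<int64_t> n_head_kv_arr = { 3,  4,  5};

    for (size_t i = 0; i < n_head_arr.size(); ++i) {
        const int64_t n_head     = n_head_arr[i];
        const int64_t n_head_qkv = 2 * n_head_kv_arr[i] + n_head;  // K heads + V heads + Q heads
        std::printf("layer %zu: wqkv cols = %lld\n", i, (long long)(n_head_qkv * n_embd_head_k));
    }
    return 0;
}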
+ case LLM_ARCH_GPTNEOX:
2676
+ {
2677
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2678
+
2679
+ // output
2680
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2681
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2682
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2683
+
2684
+ for (int i = 0; i < n_layer; ++i) {
2685
+ auto & layer = layers[i];
2686
+
2687
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2688
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2689
+
2690
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2691
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
2692
+
2693
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2694
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
2695
+
2696
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2697
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
2698
+
2699
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2700
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
2701
+
2702
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2703
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
2704
+ }
2705
+ } break;
2706
+ case LLM_ARCH_ARCTIC:
2707
+ {
2708
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2709
+
2710
+ // output
2711
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2712
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2713
+
2714
+ // if output is NULL, init from the input tok embed
2715
+ if (output == NULL) {
2716
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2717
+ }
2718
+
2719
+ for (int i = 0; i < n_layer; ++i) {
2720
+ auto & layer = layers[i];
2721
+
2722
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2723
+
2724
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2725
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2726
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2727
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2728
+
2729
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2730
+
2731
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
2732
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
2733
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
2734
+
2735
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2736
+ layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
2737
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
2738
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
2739
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
2740
+ }
2741
+ } break;
2742
+ case LLM_ARCH_DEEPSEEK:
2743
+ {
2744
+
2745
+ const int64_t n_ff_exp = hparams.n_ff_exp;
2746
+ const int64_t n_expert_shared = hparams.n_expert_shared;
2747
+
2748
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2749
+
2750
+ // output
2751
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2752
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2753
+
2754
+ for (int i = 0; i < n_layer; ++i) {
2755
+ auto & layer = layers[i];
2756
+
2757
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2758
+
2759
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2760
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2761
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2762
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2763
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2764
+
2765
+ if (i < (int) hparams.n_layer_dense_lead) {
2766
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2767
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2768
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2769
+ } else {
2770
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2771
+
2772
+ if (n_expert == 0) {
2773
+ throw std::runtime_error("n_expert must be > 0");
2774
+ }
2775
+ if (n_expert_used == 0) {
2776
+ throw std::runtime_error("n_expert_used must be > 0");
2777
+ }
2778
+
2779
+ // MoE branch
2780
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
2781
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
2782
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
2783
+
2784
+ // Shared expert branch
2785
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
2786
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
2787
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
2788
+ }
2789
+ }
2790
+ } break;
2791
+ case LLM_ARCH_DEEPSEEK2:
2792
+ {
2793
+ const bool is_lite = (hparams.n_layer == 27);
2794
+
2795
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
2796
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
2797
+
2798
+ const int64_t q_lora_rank = hparams.n_lora_q;
2799
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
2800
+
2801
+ const int64_t n_ff_exp = hparams.n_ff_exp;
2802
+ const int64_t n_expert_shared = hparams.n_expert_shared;
2803
+
2804
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2805
+
2806
+ // output
2807
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2808
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2809
+
2810
+ for (int i = 0; i < n_layer; ++i) {
2811
+ auto & layer = layers[i];
2812
+
2813
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2814
+ if (!is_lite) {
2815
+ layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
2816
+ }
2817
+
2818
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
2819
+
2820
+ if (!is_lite) {
2821
+ layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
2822
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
2823
+ } else {
2824
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2825
+ }
2826
+
2827
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
2828
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
2829
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
2830
+
2831
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2832
+
2833
+ if (i < (int) hparams.n_layer_dense_lead) {
2834
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2835
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2836
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2837
+ } else {
2838
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2839
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
2840
+
2841
+ if (n_expert == 0) {
2842
+ throw std::runtime_error("n_expert must be > 0");
2843
+ }
2844
+ if (n_expert_used == 0) {
2845
+ throw std::runtime_error("n_expert_used must be > 0");
2846
+ }
2847
+
2848
+ // MoE branch
2849
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
2850
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
2851
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
2852
+
2853
+ // Shared expert branch
2854
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
2855
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
2856
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
2857
+ }
2858
+ }
2859
+ } break;
2860
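The DeepSeek2 case uses low-rank attention projections: queries pass through a q_lora_rank bottleneck (skipped in the lite variant), keys and values share a kv_lora_rank bottleneck, and each head is split into a RoPE part of size n_rot and a non-RoPE part. A sketch of how the tensor widths above fit together, with illustrative DeepSeek-V2-like numbers rather than values read from a model file:

#include <cassert>
#include <cstdint>

int main() {
    // hypothetical sizes
    const int64_t n_embd        = 5120;
    const int64_t n_head        = 128;
    const int64_t n_embd_head_k = 192;   // per-head K/Q dim
    const int64_t n_embd_head_v = 128;   // per-head V dim
    const int64_t n_rot         = 64;    // RoPE part of each head

    const int64_t qk_rope = n_rot;                  // 64
    const int64_t qk_nope = n_embd_head_k - n_rot;  // 128

    const int64_t q_lora_rank  = 1536;
    const int64_t kv_lora_rank = 512;

    // widths requested above:
    //   wq_a:      {n_embd, q_lora_rank}
    //   wq_b:      {q_lora_rank, n_head * n_embd_head_k}
    //   wkv_a_mqa: {n_embd, kv_lora_rank + qk_rope}
    //   wkv_b:     {kv_lora_rank, n_head * (qk_nope + n_embd_head_v)}
    //   wo:        {n_head * n_embd_head_v, n_embd}
    assert(kv_lora_rank + qk_rope == 576);
    assert(n_head * (qk_nope + n_embd_head_v) == 128 * 256);
    (void)n_embd; (void)q_lora_rank;
    return 0;
}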
+ case LLM_ARCH_BITNET:
2861
+ {
2862
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2863
+
2864
+ // output
2865
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2866
+
2867
+ for (int i = 0; i < n_layer; ++i) {
2868
+ auto & layer = layers[i];
2869
+
2870
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2871
+ layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
2872
+
2873
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2874
+ layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
2875
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2876
+ layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
2877
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2878
+ layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
2879
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2880
+ layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
2881
+
2882
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2883
+ layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
2884
+
2885
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2886
+ layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
2887
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2888
+ layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
2889
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2890
+ layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
2891
+ }
2892
+ } break;
2893
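The BitNet case attaches an optional one-element "scale" tensor to each quantized projection, marked TENSOR_NOT_REQUIRED so conversions without it still load. A minimal sketch of applying such a per-tensor scale to a matmul result; the apply_scale helper is purely illustrative and not part of the loader or graph code:

#include <cstdio>
#include <vector>

// illustrative helper: rescale a projection's output by an optional {1}-shaped scale tensor
static void apply_scale(std::vector<float> & y, const float * scale /* may be null */) {
    if (scale == nullptr) {
        return;  // the scale tensor was absent in the GGUF -> no extra scaling
    }
    for (float & v : y) {
        v *= *scale;
    }
}

int main() {
    std::vector<float> y = {1.0f, -2.0f, 0.5f};
    const float wq_scale = 0.125f;  // hypothetical attn_q.scale value

    apply_scale(y, &wq_scale);
    std::printf("%.3f %.3f %.3f\n", y[0], y[1], y[2]);
    return 0;
}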
+ case LLM_ARCH_T5:
2894
+ {
2895
+ const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
2896
+
2897
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2898
+
2899
+ // output
2900
+ output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
2901
+ output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
2902
+
2903
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2904
+ // if output is NULL, init from the input tok embed
2905
+ if (output == NULL) {
2906
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2907
+ }
2908
+
2909
+ for (int i = 0; i < n_layer; ++i) {
2910
+ auto & layer = layers[i];
2911
+
2912
+ layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
2913
+ layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
2914
+
2915
+ layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2916
+ layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2917
+ layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
2918
+ layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
2919
+
2920
+ layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
2921
+ layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
2922
+ layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2923
+ layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2924
+
2925
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
2926
+ layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
2927
+
2928
+ layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2929
+ layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2930
+ layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
2931
+ layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
2932
+
2933
+ layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
2934
+ // this tensor seems to be unused in HF transformers implementation
2935
+ layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
2936
+
2937
+ layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2938
+ layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2939
+ layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
2940
+ layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
2941
+
2942
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
2943
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
2944
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2945
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2946
+ }
2947
+ } break;
2948
+ case LLM_ARCH_T5ENCODER:
2949
+ {
2950
+ const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
2951
+
2952
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2953
+
2954
+ // output
2955
+ output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
2956
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2957
+ // if output is NULL, init from the input tok embed
2958
+ if (output == NULL) {
2959
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2960
+ }
2961
+
2962
+ for (int i = 0; i < n_layer; ++i) {
2963
+ auto & layer = layers[i];
2964
+
2965
+ layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
2966
+ layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
2967
+
2968
+ layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2969
+ layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2970
+ layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
2971
+ layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
2972
+
2973
+ layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
2974
+ layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
2975
+ layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2976
+ layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2977
+ }
2978
+ } break;
2979
+ case LLM_ARCH_JAIS:
2980
+ {
2981
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2982
+
2983
+ // output
2984
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2985
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2986
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2987
+
2988
+ for (int i = 0; i < n_layer; ++i) {
2989
+ auto & layer = layers[i];
2990
+
2991
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2992
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2993
+
2994
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2995
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
2996
+
2997
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2998
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
2999
+
3000
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3001
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
3002
+
3003
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3004
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
3005
+
3006
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3007
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
3008
+
3009
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3010
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
3011
+ }
3012
+ } break;
3013
+ case LLM_ARCH_CHATGLM:
3014
+ {
3015
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3016
+
3017
+ // output
3018
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3019
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3020
+
3021
+ for (int i = 0; i < n_layer; ++i) {
3022
+ auto & layer = layers[i];
3023
+
3024
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3025
+
3026
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
3027
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
3028
+
3029
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3030
+
3031
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3032
+
3033
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
3034
+
3035
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3036
+ }
3037
+ } break;
3038
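Note that the ChatGLM case allocates ffn_up with width n_ff*2 and no separate ffn_gate: the gate and up projections are fused into one tensor and split at graph-build time before the SwiGLU-style activation. A minimal sketch of that split for one row of activations; the half ordering (gate first, then up) and the silu choice are assumptions about the convention, not a copy of the graph code:

#include <cmath>
#include <cstdio>
#include <vector>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

int main() {
    const int n_ff = 4;

    // hypothetical output of the fused up projection: first half = gate, second half = up
    std::vector<float> fused = {0.2f, -1.0f, 0.7f, 1.5f,   // gate half
                                1.0f,  2.0f, 0.5f, -0.3f}; // up half

    std::vector<float> h(n_ff);
    for (int j = 0; j < n_ff; ++j) {
        h[j] = silu(fused[j]) * fused[n_ff + j];  // silu(gate) * up
    }
    for (float v : h) std::printf("%.3f ", v);
    std::printf("\n");
    return 0;
}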
+ case LLM_ARCH_NEMOTRON:
3039
+ {
3040
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3041
+
3042
+ // output
3043
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3044
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3045
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3046
+
3047
+ for (int i = 0; i < n_layer; ++i) {
3048
+ auto & layer = layers[i];
3049
+
3050
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3051
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3052
+
3053
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3054
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3055
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3056
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3057
+
3058
+ // optional bias tensors
3059
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3060
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3061
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3062
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3063
+
3064
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3065
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
3066
+
3067
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3068
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3069
+
3070
+ // optional MLP bias
3071
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3072
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3073
+ }
3074
+ } break;
3075
+ case LLM_ARCH_EXAONE:
3076
+ {
3077
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3078
+
3079
+ // output
3080
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3081
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3082
+
3083
+ for (int i = 0; i < n_layer; ++i) {
3084
+ auto & layer = layers[i];
3085
+
3086
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3087
+
3088
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3089
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
3090
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
3091
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3092
+
3093
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3094
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3095
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3096
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3097
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3098
+ }
3099
+ } break;
3100
+ case LLM_ARCH_RWKV6:
3101
+ {
3102
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3103
+
3104
+ // Block 0, LN0
3105
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
3106
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
3107
+
3108
+ // output
3109
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3110
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3111
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3112
+
3113
+ const int time_mix_extra_dim = hparams.time_mix_extra_dim;
3114
+ const int time_decay_extra_dim = hparams.time_decay_extra_dim;
3115
+ const int head_size = hparams.wkv_head_size;
3116
+ const int attn_hidden_size = n_embd;
3117
+ const int ffn_size = hparams.n_ff_arr[0];
3118
+
3119
+ for (int i = 0; i < n_layer; ++i) {
3120
+ auto & layer = layers[i];
3121
+
3122
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3123
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3124
+
3125
+ layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
3126
+ layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
3127
+
3128
+ layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
3129
+ layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
3130
+
3131
+ layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
3132
+ layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
3133
+ layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
3134
+ layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
3135
+ layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
3136
+ layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
3137
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, llama_model_loader::TENSOR_NOT_REQUIRED);
3138
+ GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
3139
+
3140
+ layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
3141
+ layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
3142
+ layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
3143
+ layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
3144
+ layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
3145
+ layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
3146
+ layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
3147
+ layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
3148
+
3149
+ layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
3150
+ layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
3151
+ layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
3152
+
3153
+ layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
3154
+ layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
3155
+
3156
+ layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
3157
+ layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
3158
+ layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
3159
+ }
3160
+
3161
+ } break;
3162
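In the RWKV6 case the five per-channel interpolation ("lerp") tensors for w/k/v/r/g can either be stored separately or packed into one time_mix_lerp_fused tensor with a trailing dimension of 5; the GGML_ASSERT above only demands that at least one of the two layouts is present. A deliberately simplified sketch of the token-shift interpolation those coefficients drive for a single channel, ignoring the data-dependent correction produced by time_mix_w1/time_mix_w2 (variable names are descriptive, not the graph's):

#include <cstdio>

int main() {
    // token-shift mixing: blend the current embedding with the previous token's
    const float x_cur  = 0.8f;   // current token, one channel
    const float x_prev = -0.2f;  // previous token, same channel
    const float mu_k   = 0.35f;  // learned lerp coefficient for the "k" branch

    // xk = lerp(x_cur, x_prev, mu_k)
    const float xk = x_cur + mu_k * (x_prev - x_cur);
    std::printf("mixed input for the k projection: %.3f\n", xk);
    return 0;
}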
+ case LLM_ARCH_RWKV6QWEN2:
3163
+ {
3164
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3165
+
3166
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3167
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
3168
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3169
+
3170
+ const int time_mix_extra_dim = hparams.time_mix_extra_dim;
3171
+ const int time_decay_extra_dim = hparams.time_decay_extra_dim;
3172
+ const int head_size = hparams.wkv_head_size;
3173
+ const int attn_hidden_size = n_embd;
3174
+ const int n_head_kv = hparams.n_head_kv();
3175
+ int attn_key_value_size;
3176
+ if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
3177
+ attn_key_value_size = attn_hidden_size;
3178
+ } else {
3179
+ attn_key_value_size = n_head_kv * head_size;
3180
+ }
3181
+
3182
+ for (int i = 0; i < n_layer; ++i) {
3183
+ auto & layer = layers[i];
3184
+
3185
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3186
+
3187
+ layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
3188
+ layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
3189
+
3190
+ layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
3191
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
3192
+
3193
+ layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
3194
+ layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
3195
+ layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
3196
+ layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
3197
+ layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
3198
+ layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
3199
+ layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
3200
+ layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
3201
+ // optional bias tensors
3202
+ layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
3203
+ layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
3204
+ layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
3205
+
3206
+ layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
3207
+
3208
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3209
+
3210
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3211
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3212
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3213
+ }
3214
+ } break;
3215
+ case LLM_ARCH_CHAMELEON:
3216
+ {
3217
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3218
+
3219
+ // output
3220
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3221
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3222
+ // if output is NULL, init from the input tok embed
3223
+ if (output == NULL) {
3224
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3225
+ }
3226
+
3227
+ for (int i = 0; i < n_layer; ++i) {
3228
+ auto & layer = layers[i];
3229
+
3230
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3231
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
3232
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
3233
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
3234
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
3235
+
3236
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3237
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3238
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3239
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3240
+
3241
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3242
+
3243
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3244
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3245
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3246
+ }
3247
+ } break;
3248
+ case LLM_ARCH_WAVTOKENIZER_DEC:
3249
+ {
3250
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
3251
+
3252
+ conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
3253
+ conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
3254
+
3255
+ // posnet
3256
+ {
3257
+ const int64_t n_embd = hparams.posnet.n_embd;
3258
+
3259
+ for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
3260
+ auto & layer = layers[i].posnet;
3261
+
3262
+ // posnet:
3263
+ //
3264
+ // - resnet
3265
+ // - resnet
3266
+ // - attn
3267
+ // - resnet
3268
+ // - resnet
3269
+ // - norm
3270
+ //
3271
+ switch (i) {
3272
+ case 0:
3273
+ case 1:
3274
+ case 3:
3275
+ case 4:
3276
+ {
3277
+ layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
3278
+ layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
3279
+
3280
+ layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
3281
+ layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
3282
+
3283
+ layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
3284
+ layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
3285
+
3286
+ layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
3287
+ layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
3288
+ } break;
3289
+ case 2:
3290
+ {
3291
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
3292
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
3293
+
3294
+ layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
3295
+ layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
3296
+
3297
+ layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
3298
+ layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
3299
+
3300
+ layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
3301
+ layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
3302
+
3303
+ layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
3304
+ layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
3305
+ } break;
3306
+ case 5:
3307
+ {
3308
+ layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
3309
+ layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
3310
+ } break;
3311
+ default: GGML_ABORT("unknown posnet layer");
3312
+ };
3313
+ }
3314
+ }
3315
+
3316
+ GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
3317
+
3318
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
3319
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
3320
+
3321
+ // convnext
3322
+ {
3323
+ const int64_t n_embd = hparams.convnext.n_embd;
3324
+
3325
+ for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
3326
+ auto & layer = layers[i].convnext;
3327
+
3328
+ layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
3329
+ layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
3330
+
3331
+ layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
3332
+ layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
3333
+
3334
+ layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
3335
+ layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
3336
+
3337
+ layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
3338
+ layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
3339
+
3340
+ layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
3341
+ }
3342
+
3343
+ // output
3344
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3345
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3346
+ }
3347
+
3348
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
3349
+ output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
3350
+ } break;
3351
+ default:
3352
+ throw std::runtime_error("unknown architecture");
3353
+ }
3354
+
3355
+ if (n_moved_tensors > 0) {
3356
+ LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
3357
+ __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
3358
+ ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
3359
+ }
3360
+ }
3361
+
3362
+ ml.done_getting_tensors();
3363
+
3364
+ ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
3365
+ pimpl->mappings.reserve(ml.mappings.size());
3366
+
3367
+ // create the backend buffers
3368
+ std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
3369
+ ctx_bufs.reserve(ctx_map.size());
3370
+
3371
+ // Ensure we have enough capacity for the maximum number of backend buffers we might create
3372
+ const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
3373
+ pimpl->bufs.reserve(n_max_backend_buffer);
3374
+
3375
+ for (auto & it : ctx_map) {
3376
+ ggml_backend_buffer_type_t buft = it.first;
3377
+ ggml_context * ctx = it.second;
3378
+
3379
+ // skip contexts without tensors
3380
+ if (ggml_get_first_tensor(ctx) == nullptr) {
3381
+ continue;
3382
+ }
3383
+
3384
+ llama_buf_map buf_map;
3385
+ buf_map.reserve(n_max_backend_buffer);
3386
+
3387
+ // check if it is possible to use buffer_from_host_ptr with this buffer type
3388
+ ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
3389
+ if (!dev) {
3390
+ // FIXME: workaround for CPU backend buft having a NULL device
3391
+ dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
3392
+ }
3393
+ ggml_backend_dev_props props;
3394
+ ggml_backend_dev_get_props(dev, &props);
3395
+ bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
3396
+ bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
3397
+
3398
+ if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
3399
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
3400
+ // only the mmap region containing the tensors in the model is mapped to the backend buffer
3401
+ // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
3402
+ // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
3403
+ void * addr = nullptr;
3404
+ size_t first, last; // NOLINT
3405
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
3406
+ if (first >= last) {
3407
+ continue;
3408
+ }
3409
+ const size_t max_size = ggml_get_max_tensor_size(ctx);
3410
+ ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
3411
+ if (buf == nullptr) {
3412
+ throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
3413
+ }
3414
+ pimpl->bufs.emplace_back(buf);
3415
+ buf_map.emplace(idx, buf);
3416
+ }
3417
+ }
3418
+ else {
3419
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
3420
+ if (buf == nullptr) {
3421
+ throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
3422
+ }
3423
+ pimpl->bufs.emplace_back(buf);
3424
+ if (use_mlock && ggml_backend_buffer_is_host(buf)) {
3425
+ pimpl->mlock_bufs.emplace_back(new llama_mlock);
3426
+ auto & mlock_buf = pimpl->mlock_bufs.back();
3427
+ mlock_buf->init (ggml_backend_buffer_get_base(buf));
3428
+ mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
3429
+ }
3430
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
3431
+ buf_map.emplace(idx, buf);
3432
+ }
3433
+ }
3434
+
3435
+ if (pimpl->bufs.empty()) {
3436
+ throw std::runtime_error("failed to allocate buffer");
3437
+ }
3438
+
3439
+ for (auto & buf : buf_map) {
3440
+ // indicate that this buffer contains weights
3441
+ // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
3442
+ ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
3443
+ }
3444
+
3445
+ ctx_bufs.emplace_back(ctx, buf_map);
3446
+ }
3447
+
3448
+ if (llama_supports_gpu_offload()) {
3449
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
3450
+
3451
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
3452
+ if (n_gpu_layers > (int) hparams.n_layer) {
3453
+ LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
3454
+ }
3455
+
3456
+ const int max_backend_supported_layers = hparams.n_layer + 1;
3457
+ const int max_offloadable_layers = hparams.n_layer + 1;
3458
+
3459
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
3460
+ }
3461
+
3462
+ // print memory requirements per buffer type
3463
+ for (auto & buf : pimpl->bufs) {
3464
+ LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
3465
+ }
3466
+
3467
+ // populate tensors_by_name
3468
+ for (auto & ctx : pimpl->ctxs) {
3469
+ for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
3470
+ tensors_by_name.emplace_back(ggml_get_name(cur), cur);
3471
+ }
3472
+ }
3473
+
3474
+ // load tensor data
3475
+ for (auto & it : ctx_bufs) {
3476
+ ggml_context * ctx = it.first;
3477
+ auto & bufs = it.second;
3478
+ if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
3479
+ return false;
3480
+ }
3481
+ }
3482
+
3483
+ if (use_mmap_buffer) {
3484
+ for (auto & mapping : ml.mappings) {
3485
+ pimpl->mappings.emplace_back(std::move(mapping));
3486
+ }
3487
+ }
3488
+
3489
+ return true;
3490
+ }
3491
+
3492
+ std::string llama_model::arch_name() const {
3493
+ return llm_arch_name(arch);
3494
+ }
3495
+
3496
+ std::string llama_model::type_name() const {
3497
+ return llm_type_name(type);
3498
+ }
3499
+
3500
+ std::string llama_model::desc() const {
3501
+ return pimpl->desc_str;
3502
+ }
3503
+
3504
+ size_t llama_model::size() const {
3505
+ return pimpl->n_bytes;
3506
+ }
3507
+
3508
+ size_t llama_model::max_nodes() const {
3509
+ return std::max<size_t>(8192, tensors_by_name.size()*5);
3510
+ }
3511
+
3512
+ size_t llama_model::n_devices() const {
3513
+ return devices.size();
3514
+ }
3515
+
3516
+ uint64_t llama_model::n_elements() const {
3517
+ return pimpl->n_elements;
3518
+ }
3519
+
3520
+ void llama_model::print_info() const {
3521
+ const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
3522
+
3523
+ auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
3524
+ bool is_var = false;
3525
+
3526
+ std::vector<uint32_t> v;
3527
+ for (uint32_t i = 0; i < n; ++i) {
3528
+ v.push_back(f(i));
3529
+ if (v[i] != v[0]) {
3530
+ is_var = true;
3531
+ }
3532
+ }
3533
+
3534
+ std::stringstream ss;
3535
+
3536
+ if (is_var) {
3537
+ ss << "[";
3538
+ for (uint32_t i = 0; i < n; ++i) {
3539
+ ss << v[i];
3540
+ if (i < n - 1) {
3541
+ ss << ", ";
3542
+ }
3543
+ }
3544
+ ss << "]";
3545
+ } else {
3546
+ ss << v[0];
3547
+ }
3548
+
3549
+ return ss.str();
3550
+ };
3551
+
3552
+ // hparams
3553
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
3554
+ LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
3555
+
3556
+ if (!hparams.vocab_only) {
3557
+ LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
3558
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
3559
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
3560
+ LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
3561
+ LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
3562
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
3563
+ LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
3564
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
3565
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
3566
+ LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
3567
+ LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
3568
+ LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
3569
+ LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
3570
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
3571
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
3572
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
3573
+ LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
3574
+ LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
3575
+ LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
3576
+ LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
3577
+ LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
3578
+ LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
3579
+ LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
3580
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
3581
+ LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
3582
+ LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
3583
+ LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
3584
+ LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
3585
+ LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
3586
+ LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
3587
+ LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
3588
+ LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
3589
+ LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
3590
+ }
3591
+
3592
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
3593
+ if (pimpl->n_elements >= 1e12) {
3594
+ LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
3595
+ } else if (pimpl->n_elements >= 1e9) {
3596
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
3597
+ } else if (pimpl->n_elements >= 1e6) {
3598
+ LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
3599
+ } else {
3600
+ LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
3601
+ }
3602
+
3603
+ // general kv
3604
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
3605
+
3606
+ if (arch == LLM_ARCH_DEEPSEEK) {
3607
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
3608
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
3609
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
3610
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
3611
+ }
3612
+
3613
+ if (arch == LLM_ARCH_DEEPSEEK2) {
3614
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
3615
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
3616
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
3617
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
3618
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
3619
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
3620
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
3621
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
3622
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
3623
+ }
3624
+
3625
+ if (arch == LLM_ARCH_QWEN2MOE) {
3626
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
3627
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
3628
+ }
3629
+
3630
+ if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
3631
+ LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
3632
+ LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
3633
+ LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
3634
+ }
3635
+
3636
+ vocab.print_info();
3637
+ }
3638
+
3639
+ ggml_backend_dev_t llama_model::dev_layer(int il) const {
3640
+ return pimpl->dev_layer.at(il).dev;
3641
+ }
3642
+
3643
+ ggml_backend_dev_t llama_model::dev_output() const {
3644
+ return pimpl->dev_output.dev;
3645
+ }
3646
+
3647
+ template<typename F>
3648
+ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
3649
+ ggml_init_params params = {
3650
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
3651
+ /*.mem_buffer =*/ NULL,
3652
+ /*.no_alloc =*/ true,
3653
+ };
3654
+
3655
+ ggml_context_ptr ctx { ggml_init(params) };
3656
+ if (!ctx) {
3657
+ throw std::runtime_error(format("failed to create ggml context"));
3658
+ }
3659
+
3660
+ ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
3661
+ ggml_tensor * op_tensor = fn(ctx.get());
3662
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
3663
+ if (op_tensor->src[i] != nullptr) {
3664
+ assert(op_tensor->src[i]->buffer == nullptr);
3665
+ op_tensor->src[i]->buffer = buf.get();
3666
+ }
3667
+ }
3668
+
3669
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
3670
+
3671
+ return op_supported;
3672
+ }
3673
+
3674
+ template<typename F>
3675
+ static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
3676
+ for (const auto & cur : buft_list) {
3677
+ ggml_backend_dev_t cur_dev = cur.first;
3678
+ ggml_backend_buffer_type_t cur_buft = cur.second;
3679
+ if (buft_supported(cur_buft, cur_dev, fn)) {
3680
+ return cur_buft;
3681
+ }
3682
+ }
3683
+
3684
+ throw std::runtime_error(format("no suitable buffer type found"));
3685
+ }
3686
+
3687
+ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
3688
+ return ::select_buft(
3689
+ *pimpl->dev_layer.at(il).buft_list,
3690
+ [&](ggml_context * ctx) {
3691
+ ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
3692
+ ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
3693
+ return ggml_add(ctx, cur, layer_dir);
3694
+ });
3695
+ }
3696
+
3697
+ const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
3698
+ auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
3699
+ [name](const std::pair<std::string, struct ggml_tensor *> & it) {
3700
+ return it.first == name;
3701
+ });
3702
+ if (it == tensors_by_name.end()) {
3703
+ return nullptr;
3704
+ }
3705
+
3706
+ return it->second;
3707
+ }
3708
+
3709
+ //
3710
+ // interface implementation
3711
+ //
3712
+
3713
+ struct llama_model_params llama_model_default_params() {
3714
+ struct llama_model_params result = {
3715
+ /*.devices =*/ nullptr,
3716
+ /*.n_gpu_layers =*/ 0,
3717
+ /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
3718
+ /*.main_gpu =*/ 0,
3719
+ /*.tensor_split =*/ nullptr,
3720
+ /*.progress_callback =*/ nullptr,
3721
+ /*.progress_callback_user_data =*/ nullptr,
3722
+ /*.kv_overrides =*/ nullptr,
3723
+ /*.vocab_only =*/ false,
3724
+ /*.use_mmap =*/ true,
3725
+ /*.use_mlock =*/ false,
3726
+ /*.check_tensors =*/ false,
3727
+ };
3728
+
3729
+ #ifdef GGML_USE_METAL
3730
+ // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
3731
+ result.n_gpu_layers = 999;
3732
+ #endif
3733
+
3734
+ return result;
3735
+ }
3736
+
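For orientation, a minimal sketch of how these defaults are typically consumed through the public C API; the file path is hypothetical and llama_load_model_from_file is the long-standing loader entry point (it may be deprecated in favor of a renamed variant in this snapshot), so treat the call as an assumption rather than something introduced by this diff:

    #include "llama.h"

    // hedged sketch: tweak a couple of defaults, then load and free a model
    int load_example(void) {
        llama_model_params mparams = llama_model_default_params();
        mparams.use_mmap     = true; // map the GGUF file instead of reading it into RAM
        mparams.n_gpu_layers = 32;   // request partial offload; ignored without a GPU backend
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1; // load failed
        }
        // ... create a context, run inference, etc. ...
        llama_model_free(model);
        return 0;
    }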
3737
+ const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model) {
3738
+ return &model->vocab;
3739
+ }
3740
+
3741
+ void llama_free_model(struct llama_model * model) {
3742
+ llama_model_free(model);
3743
+ }
3744
+
3745
+ void llama_model_free(struct llama_model * model) {
3746
+ delete model;
3747
+ }
3748
+
3749
+ int32_t llama_model_n_ctx_train(const struct llama_model * model) {
3750
+ return model->hparams.n_ctx_train;
3751
+ }
3752
+
3753
+ int32_t llama_model_n_embd(const struct llama_model * model) {
3754
+ return model->hparams.n_embd;
3755
+ }
3756
+
3757
+ int32_t llama_model_n_layer(const struct llama_model * model) {
3758
+ return model->hparams.n_layer;
3759
+ }
3760
+
3761
+ int32_t llama_model_n_head(const struct llama_model * model) {
3762
+ return model->hparams.n_head();
3763
+ }
3764
+
3765
+ // deprecated
3766
+ int32_t llama_n_ctx_train(const struct llama_model * model) {
3767
+ return llama_model_n_ctx_train(model);
3768
+ }
3769
+
3770
+ // deprecated
3771
+ int32_t llama_n_embd(const struct llama_model * model) {
3772
+ return llama_model_n_embd(model);
3773
+ }
3774
+
3775
+ // deprecated
3776
+ int32_t llama_n_layer(const struct llama_model * model) {
3777
+ return llama_model_n_layer(model);
3778
+ }
3779
+
3780
+ // deprecated
3781
+ int32_t llama_n_head(const struct llama_model * model) {
3782
+ return llama_model_n_head(model);
3783
+ }
3784
+
3785
+ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
3786
+ switch (model->arch) {
3787
+ // these models do not use RoPE
3788
+ case LLM_ARCH_GPT2:
3789
+ case LLM_ARCH_GPTJ:
3790
+ case LLM_ARCH_MPT:
3791
+ case LLM_ARCH_REFACT:
3792
+ case LLM_ARCH_BLOOM:
3793
+ case LLM_ARCH_MAMBA:
3794
+ case LLM_ARCH_JINA_BERT_V2:
3795
+ case LLM_ARCH_T5:
3796
+ case LLM_ARCH_T5ENCODER:
3797
+ case LLM_ARCH_JAIS:
3798
+ case LLM_ARCH_RWKV6:
3799
+ case LLM_ARCH_RWKV6QWEN2:
3800
+ case LLM_ARCH_WAVTOKENIZER_DEC:
3801
+ return LLAMA_ROPE_TYPE_NONE;
3802
+
3803
+ // use what we call a normal RoPE, operating on pairs of consecutive head values
3804
+ case LLM_ARCH_LLAMA:
3805
+ case LLM_ARCH_DECI:
3806
+ case LLM_ARCH_BAICHUAN:
3807
+ case LLM_ARCH_STARCODER:
3808
+ case LLM_ARCH_PLAMO:
3809
+ case LLM_ARCH_ORION:
3810
+ case LLM_ARCH_INTERNLM2:
3811
+ case LLM_ARCH_MINICPM:
3812
+ case LLM_ARCH_XVERSE:
3813
+ case LLM_ARCH_COMMAND_R:
3814
+ case LLM_ARCH_COHERE2:
3815
+ case LLM_ARCH_OLMO:
3816
+ case LLM_ARCH_ARCTIC:
3817
+ case LLM_ARCH_DEEPSEEK:
3818
+ case LLM_ARCH_DEEPSEEK2:
3819
+ case LLM_ARCH_CHATGLM:
3820
+ case LLM_ARCH_GRANITE:
3821
+ case LLM_ARCH_GRANITE_MOE:
3822
+ case LLM_ARCH_CHAMELEON:
3823
+ return LLAMA_ROPE_TYPE_NORM;
3824
+
3825
+ // the pairs of head values are offset by n_rot/2
3826
+ case LLM_ARCH_FALCON:
3827
+ case LLM_ARCH_GROK:
3828
+ case LLM_ARCH_DBRX:
3829
+ case LLM_ARCH_BERT:
3830
+ case LLM_ARCH_NOMIC_BERT:
3831
+ case LLM_ARCH_STABLELM:
3832
+ case LLM_ARCH_BITNET:
3833
+ case LLM_ARCH_QWEN:
3834
+ case LLM_ARCH_QWEN2:
3835
+ case LLM_ARCH_QWEN2MOE:
3836
+ case LLM_ARCH_OLMO2:
3837
+ case LLM_ARCH_OLMOE:
3838
+ case LLM_ARCH_PHI2:
3839
+ case LLM_ARCH_PHI3:
3840
+ case LLM_ARCH_PHIMOE:
3841
+ case LLM_ARCH_GEMMA:
3842
+ case LLM_ARCH_GEMMA2:
3843
+ case LLM_ARCH_STARCODER2:
3844
+ case LLM_ARCH_OPENELM:
3845
+ case LLM_ARCH_GPTNEOX:
3846
+ case LLM_ARCH_CODESHELL:
3847
+ case LLM_ARCH_NEMOTRON:
3848
+ case LLM_ARCH_EXAONE:
3849
+ case LLM_ARCH_MINICPM3:
3850
+ return LLAMA_ROPE_TYPE_NEOX;
3851
+
3852
+ case LLM_ARCH_QWEN2VL:
3853
+ return LLAMA_ROPE_TYPE_MROPE;
3854
+
3855
+ // all model arches should be listed explicitly here
3856
+ case LLM_ARCH_UNKNOWN:
3857
+ GGML_ABORT("unknown architecture");
3858
+ }
3859
+
3860
+ return LLAMA_ROPE_TYPE_NONE;
3861
+ }
3862
+
3863
+ float llama_model_rope_freq_scale_train(const struct llama_model * model) {
3864
+ return model->hparams.rope_freq_scale_train;
3865
+ }
3866
+
3867
+ int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
3868
+ const auto & it = model->gguf_kv.find(key);
3869
+ if (it == model->gguf_kv.end()) {
3870
+ if (buf_size > 0) {
3871
+ buf[0] = '\0';
3872
+ }
3873
+ return -1;
3874
+ }
3875
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
3876
+ }
3877
+
3878
+ int32_t llama_model_meta_count(const struct llama_model * model) {
3879
+ return (int)model->gguf_kv.size();
3880
+ }
3881
+
3882
+ int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
3883
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
3884
+ if (buf_size > 0) {
3885
+ buf[0] = '\0';
3886
+ }
3887
+ return -1;
3888
+ }
3889
+ auto it = model->gguf_kv.begin();
3890
+ std::advance(it, i);
3891
+ return snprintf(buf, buf_size, "%s", it->first.c_str());
3892
+ }
3893
+
3894
+ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
3895
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
3896
+ if (buf_size > 0) {
3897
+ buf[0] = '\0';
3898
+ }
3899
+ return -1;
3900
+ }
3901
+ auto it = model->gguf_kv.begin();
3902
+ std::advance(it, i);
3903
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
3904
+ }
3905
+
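As a usage note, the index-based metadata accessors above compose into a simple dump loop; a minimal sketch with arbitrary buffer sizes:

    #include <cstdio>
    #include "llama.h"

    // hedged sketch: enumerate all GGUF key/value metadata of a loaded model
    static void dump_model_meta(const llama_model * model) {
        char key[256];
        char val[2048];
        const int32_t n_kv = llama_model_meta_count(model);
        for (int32_t i = 0; i < n_kv; ++i) {
            // both accessors truncate to the buffer size and return -1 only for an out-of-range index
            llama_model_meta_key_by_index    (model, i, key, sizeof(key));
            llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
            printf("%s = %s\n", key, val);
        }
    }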
3906
+ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
3907
+ return snprintf(buf, buf_size, "%s", model->desc().c_str());
3908
+ }
3909
+
3910
+ uint64_t llama_model_size(const struct llama_model * model) {
3911
+ return model->size();
3912
+ }
3913
+
3914
+ const char * llama_model_chat_template(const struct llama_model * model) {
3915
+ const auto & it = model->gguf_kv.find(LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE));
3916
+ if (it == model->gguf_kv.end()) {
3917
+ return nullptr;
3918
+ }
3919
+
3920
+ return it->second.c_str();
3921
+ }
3922
+
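The lookup above returns a null pointer when the GGUF carries no tokenizer.chat_template key, so callers should check before formatting prompts; a minimal sketch in which the fallback string is a hypothetical caller choice:

    #include "llama.h"

    // hedged sketch: prefer the model's embedded template, otherwise fall back
    static const char * pick_chat_template(const llama_model * model) {
        const char * tmpl = llama_model_chat_template(model);
        return tmpl != NULL ? tmpl : "chatml"; // hypothetical fallback chosen by the caller
    }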
3923
+ uint64_t llama_model_n_params(const struct llama_model * model) {
3924
+ return model->n_elements();
3925
+ }
3926
+
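Taken together with llama_model_size above, this gives a quick estimate of the average bits per weight of the loaded (possibly quantized) model; a minimal sketch:

    #include <cstdio>
    #include "llama.h"

    // hedged sketch: average bits per weight = total bytes * 8 / parameter count
    static void print_bpw(const llama_model * model) {
        const double bpw = 8.0 * (double) llama_model_size(model) / (double) llama_model_n_params(model);
        printf("approx. %.2f bits per weight\n", bpw);
    }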
3927
+ bool llama_model_has_encoder(const struct llama_model * model) {
3928
+ switch (model->arch) {
3929
+ case LLM_ARCH_T5: return true;
3930
+ case LLM_ARCH_T5ENCODER: return true;
3931
+ default: return false;
3932
+ }
3933
+ }
3934
+
3935
+ bool llama_model_has_decoder(const struct llama_model * model) {
3936
+ switch (model->arch) {
3937
+ case LLM_ARCH_T5ENCODER: return false;
3938
+ default: return true;
3939
+ }
3940
+ }
3941
+
3942
+ llama_token llama_model_decoder_start_token(const struct llama_model * model) {
3943
+ return model->hparams.dec_start_token_id;
3944
+ }
3945
+
3946
+ bool llama_model_is_recurrent(const struct llama_model * model) {
3947
+ switch (model->arch) {
3948
+ case LLM_ARCH_MAMBA: return true;
3949
+ case LLM_ARCH_RWKV6: return true;
3950
+ case LLM_ARCH_RWKV6QWEN2: return true;
3951
+ default: return false;
3952
+ }
3953
+ }