cui-llama.rn 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/android/src/main/jni.cpp +9 -9
  2. package/cpp/common.cpp +163 -60
  3. package/cpp/common.h +43 -12
  4. package/cpp/ggml-alloc.c +1042 -1037
  5. package/cpp/ggml-backend-impl.h +255 -256
  6. package/cpp/ggml-backend-reg.cpp +582 -582
  7. package/cpp/ggml-backend.cpp +2002 -2002
  8. package/cpp/ggml-backend.h +354 -352
  9. package/cpp/ggml-common.h +1853 -1853
  10. package/cpp/ggml-cpp.h +39 -39
  11. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  12. package/cpp/ggml-cpu-aarch64.h +8 -8
  13. package/cpp/ggml-cpu-impl.h +386 -386
  14. package/cpp/ggml-cpu-quants.c +10920 -10839
  15. package/cpp/ggml-cpu-traits.cpp +36 -36
  16. package/cpp/ggml-cpu-traits.h +38 -38
  17. package/cpp/ggml-cpu.c +329 -60
  18. package/cpp/ggml-cpu.cpp +10 -2
  19. package/cpp/ggml-cpu.h +135 -135
  20. package/cpp/ggml-impl.h +567 -567
  21. package/cpp/ggml-metal-impl.h +17 -17
  22. package/cpp/ggml-metal.m +4884 -4884
  23. package/cpp/ggml-quants.c +5238 -5238
  24. package/cpp/ggml-threading.h +14 -14
  25. package/cpp/ggml.c +6514 -6448
  26. package/cpp/ggml.h +2194 -2163
  27. package/cpp/gguf.cpp +1329 -1325
  28. package/cpp/gguf.h +202 -202
  29. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  30. package/cpp/json-schema-to-grammar.h +8 -8
  31. package/cpp/json.hpp +24766 -24766
  32. package/cpp/llama-adapter.cpp +347 -346
  33. package/cpp/llama-adapter.h +74 -73
  34. package/cpp/llama-arch.cpp +1487 -1434
  35. package/cpp/llama-arch.h +400 -395
  36. package/cpp/llama-batch.cpp +368 -368
  37. package/cpp/llama-batch.h +88 -88
  38. package/cpp/llama-chat.cpp +578 -567
  39. package/cpp/llama-chat.h +52 -51
  40. package/cpp/llama-context.cpp +1775 -1771
  41. package/cpp/llama-context.h +128 -128
  42. package/cpp/llama-cparams.cpp +1 -1
  43. package/cpp/llama-cparams.h +37 -37
  44. package/cpp/llama-cpp.h +30 -30
  45. package/cpp/llama-grammar.cpp +1139 -1139
  46. package/cpp/llama-grammar.h +143 -143
  47. package/cpp/llama-hparams.cpp +71 -71
  48. package/cpp/llama-hparams.h +139 -140
  49. package/cpp/llama-impl.cpp +167 -167
  50. package/cpp/llama-impl.h +61 -61
  51. package/cpp/llama-kv-cache.cpp +718 -718
  52. package/cpp/llama-kv-cache.h +218 -218
  53. package/cpp/llama-mmap.cpp +2 -1
  54. package/cpp/llama-mmap.h +67 -67
  55. package/cpp/llama-model-loader.cpp +1124 -1011
  56. package/cpp/llama-model-loader.h +167 -158
  57. package/cpp/llama-model.cpp +3997 -2202
  58. package/cpp/llama-model.h +370 -391
  59. package/cpp/llama-sampling.cpp +2408 -2406
  60. package/cpp/llama-sampling.h +32 -48
  61. package/cpp/llama-vocab.cpp +3247 -1982
  62. package/cpp/llama-vocab.h +125 -182
  63. package/cpp/llama.cpp +416 -2886
  64. package/cpp/llama.h +1323 -1285
  65. package/cpp/log.cpp +401 -401
  66. package/cpp/log.h +121 -121
  67. package/cpp/rn-llama.hpp +18 -12
  68. package/cpp/sampling.cpp +505 -500
  69. package/cpp/sgemm.cpp +2597 -2597
  70. package/cpp/speculative.cpp +277 -274
  71. package/cpp/speculative.h +28 -28
  72. package/cpp/unicode.cpp +2 -3
  73. package/package.json +1 -1
package/cpp/llama.cpp CHANGED
@@ -8,2512 +8,80 @@
8
8
  #include "llama-kv-cache.h"
9
9
  #include "llama-model-loader.h"
10
10
  #include "llama-model.h"
11
- #include "llama-kv-cache.h"
12
- #include "llama-model-loader.h"
13
- #include "llama-model.h"
14
-
15
- #include "ggml.h"
16
- #include "ggml-alloc.h"
17
- #include "ggml-backend.h"
18
- #include "ggml-cpp.h"
19
-
20
- #include <algorithm>
21
- #include <array>
22
- #include <cassert>
23
- #include <cfloat>
24
- #include <cmath>
25
- #include <cstddef>
26
- #include <cstdint>
27
- #include <cstdio>
28
- #include <cstring>
29
- #include <ctime>
30
- #include <functional>
31
- #include <initializer_list>
32
- #include <map>
33
-
34
- #if defined(_MSC_VER)
35
- #pragma warning(disable: 4244 4267) // possible loss of data
36
- #endif
37
-
38
- #if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING)
39
- #include <android/log.h>
40
- #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
41
- #undef LLAMA_LOG_INFO
42
- #undef LLAMA_LOG_WARN
43
- #undef LLAMA_LOG_ERROR
44
- #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
45
- #define LLAMA_LOG_WARN(...) __android_log_print(ANDROID_LOG_WARN , LLAMA_ANDROID_TAG, __VA_ARGS__)
46
- #define LLAMA_LOG_ERROR(...) __android_log_print(ANDROID_LOG_ERROR, LLAMA_ANDROID_TAG, __VA_ARGS__)
47
- #endif // __ANDROID__
48
-
49
- #if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING)
50
- #include <android/log.h>
51
- #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
52
- #undef LLAMA_LOG_INFO
53
- #undef LLAMA_LOG_WARN
54
- #undef LLAMA_LOG_ERROR
55
- #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
56
- #define LLAMA_LOG_WARN(...) __android_log_print(ANDROID_LOG_WARN , LLAMA_ANDROID_TAG, __VA_ARGS__)
57
- #define LLAMA_LOG_ERROR(...) __android_log_print(ANDROID_LOG_ERROR, LLAMA_ANDROID_TAG, __VA_ARGS__)
58
- #endif // __ANDROID__
59
-
60
- //
61
- // tensor loading (TODO: add llama_tesor_loader?)
62
- //
63
-
64
- static int llama_get_device_count(const llama_model & model) {
65
- return (int) model.devices.size();
66
- }
67
-
68
- // checks if the weight tensor can be used with the specified buffer type and device
69
- static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor * w, lm_ggml_op op, lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_dev_t dev) {
70
- LM_GGML_ASSERT(w != nullptr);
71
-
72
- if (op == LM_GGML_OP_NONE) {
73
- return true;
74
- }
75
-
76
- lm_ggml_init_params params = {
77
- /*.mem_size =*/ lm_ggml_tensor_overhead()*8,
78
- /*.mem_buffer =*/ NULL,
79
- /*.no_alloc =*/ true,
80
- };
81
- lm_ggml_context_ptr ctx_ptr { lm_ggml_init(params) };
82
- if (!ctx_ptr) {
83
- throw std::runtime_error(format("failed to create ggml context"));
84
- }
85
- lm_ggml_context * ctx = ctx_ptr.get();
86
-
87
- lm_ggml_tensor * op_tensor = nullptr;
88
-
89
- switch (op) {
90
- case LM_GGML_OP_GET_ROWS:
91
- {
92
- lm_ggml_tensor * b = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, 512);
93
- op_tensor = lm_ggml_get_rows(ctx, w, b);
94
- } break;
95
- case LM_GGML_OP_MUL_MAT:
96
- {
97
- lm_ggml_tensor * b = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
98
- op_tensor = lm_ggml_mul_mat(ctx, w, b);
99
- } break;
100
- case LM_GGML_OP_MUL_MAT_ID:
101
- {
102
- int n_expert_used = hparams.n_expert_used;
103
- lm_ggml_tensor * b = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
104
- lm_ggml_tensor * ids = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_I32, n_expert_used, 512);
105
- op_tensor = lm_ggml_mul_mat_id(ctx, w, b, ids);
106
- } break;
107
- case LM_GGML_OP_ADD:
108
- {
109
- lm_ggml_tensor * a = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
110
- op_tensor = lm_ggml_add(ctx, a, w);
111
- } break;
112
- case LM_GGML_OP_MUL:
113
- {
114
- lm_ggml_tensor * a = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
115
- op_tensor = lm_ggml_mul(ctx, a, w);
116
- } break;
117
- case LM_GGML_OP_DIV:
118
- {
119
- lm_ggml_tensor * a = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, w->ne[0]);
120
- op_tensor = lm_ggml_div(ctx, a, w);
121
- } break;
122
- case LM_GGML_OP_ROPE:
123
- {
124
- int n_embd_head = hparams.n_embd_head_v;
125
- int n_head = hparams.n_head();
126
- lm_ggml_tensor * a = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, n_embd_head, n_head, 512);
127
- lm_ggml_tensor * b = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, 512);
128
- op_tensor = lm_ggml_rope_ext(
129
- ctx, a, b, w,
130
- 0, 0, 0, 0, 0,
131
- 0, 0, 0, 0
132
- );
133
-
134
- } break;
135
- case LM_GGML_OP_SSM_CONV:
136
- {
137
- // FIXME
138
- lm_ggml_tensor * conv_x = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, 12345, w->ne[1], 6789);
139
- op_tensor = lm_ggml_ssm_conv(ctx, conv_x, w);
140
- } break;
141
- case LM_GGML_OP_SSM_SCAN:
142
- {
143
- // FIXME
144
- const int64_t d_state = w->ne[0];
145
- const int64_t d_inner = w->ne[1];
146
- const int64_t n_seq_tokens = 512;
147
- const int64_t n_seqs = 1;
148
- lm_ggml_tensor * s = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_state, d_inner, n_seqs);
149
- lm_ggml_tensor * x = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
150
- lm_ggml_tensor * dt = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
151
- lm_ggml_tensor * B = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
152
- lm_ggml_tensor * C = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
153
- op_tensor = lm_ggml_ssm_scan(ctx, s, x, dt, w, B, C);
154
- } break;
155
- case LM_GGML_OP_RWKV_WKV6:
156
- {
157
- // FIXME
158
- const int64_t S = 123;
159
- const int64_t H = 123;
160
- const int64_t n_tokens = 123;
161
- const int64_t n_seqs = 123;
162
- lm_ggml_tensor * k = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, S, 1, H, n_tokens);
163
- lm_ggml_tensor * v = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, 1, S, H, n_tokens);
164
- lm_ggml_tensor * r = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, 1, S, H, n_tokens);
165
- lm_ggml_tensor * tf = w;
166
- lm_ggml_tensor * td = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, 1, S, H, n_tokens);
167
- lm_ggml_tensor * state = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, S, n_seqs, S, H);
168
- op_tensor = lm_ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
169
- } break;
170
- case LM_GGML_OP_IM2COL:
171
- {
172
- const int n_embd = hparams.n_embd;
173
- lm_ggml_tensor * b = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
174
- op_tensor = lm_ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, LM_GGML_TYPE_F16);
175
- } break;
176
- default:
177
- LM_GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, lm_ggml_op_name(op), w->name);
178
- }
179
-
180
- // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
181
- LM_GGML_ASSERT(w->buffer == nullptr);
182
- w->buffer = lm_ggml_backend_buft_alloc_buffer(buft, 0);
183
- bool op_supported = lm_ggml_backend_dev_supports_op(dev, op_tensor);
184
- lm_ggml_backend_buffer_free(w->buffer);
185
- w->buffer = nullptr;
186
-
187
- return op_supported;
188
- }
189
-
190
- // find the first buffer type in the list that can use the tensor
191
- static lm_ggml_backend_buffer_type_t select_weight_buft(const llama_model & model, lm_ggml_tensor * tensor, lm_ggml_op op, const llama_model::buft_list_t & buft_list) {
192
- LM_GGML_ASSERT(!buft_list.empty());
193
- for (const auto & cur : buft_list) {
194
- lm_ggml_backend_dev_t cur_dev = cur.first;
195
- lm_ggml_backend_buffer_type_t cur_buft = cur.second;
196
- if (weight_buft_supported(model.hparams, tensor, op, cur_buft, cur_dev)) {
197
- return cur_buft;
198
- }
199
- }
200
- return nullptr;
201
- }
202
-
203
- // CPU: ACCEL -> CPU extra -> GPU host -> CPU
204
- static llama_model::buft_list_t make_cpu_buft_list(llama_model & model) {
205
- llama_model::buft_list_t buft_list;
206
-
207
- // add ACCEL buffer types
208
- for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
209
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
210
- if (lm_ggml_backend_dev_type(dev) == LM_GGML_BACKEND_DEVICE_TYPE_ACCEL) {
211
- auto * buft = lm_ggml_backend_dev_buffer_type(dev);
212
- // skip
213
- if (buft != lm_ggml_backend_cpu_buffer_type()) {
214
- buft_list.emplace_back(dev, buft);
215
- }
216
- }
217
- }
218
-
219
- // add extra buffer types
220
- auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
221
- auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
222
- auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
223
- lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
224
- if (lm_ggml_backend_dev_get_extra_bufts_fn) {
225
- lm_ggml_backend_buffer_type_t * extra_bufts = lm_ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
226
- while (extra_bufts && *extra_bufts) {
227
- buft_list.emplace_back(cpu_dev, *extra_bufts);
228
- ++extra_bufts;
229
- }
230
- }
231
-
232
- // add a host buffer type
233
- // storing the tensors in a host buffer is useful when the processing of large batches
234
- // is offloaded to a GPU device, since it reduces the time spent on data transfers
235
- // generally, this will be done using the first device in the list
236
- // a better approach would be to handle this on a weight-by-weight basis using the offload_op
237
- // function of the device to determine if it would benefit from being stored in a host buffer
238
- for (auto * dev : model.devices) {
239
- lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_dev_host_buffer_type(dev);
240
- if (buft) {
241
- buft_list.emplace_back(dev, buft);
242
- break;
243
- }
244
- }
245
-
246
- // add the CPU buffer type
247
- for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
248
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
249
- if (lm_ggml_backend_dev_type(dev) == LM_GGML_BACKEND_DEVICE_TYPE_CPU) {
250
- buft_list.emplace_back(dev, lm_ggml_backend_dev_buffer_type(dev));
251
- }
252
- }
253
-
254
- return buft_list;
255
- }
256
-
257
- // GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
258
- static llama_model::buft_list_t make_gpu_buft_list(lm_ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) {
259
- llama_model::buft_list_t buft_list;
260
-
261
- // add the device split buffer type if requested and available
262
- if (split_mode == LLAMA_SPLIT_MODE_ROW) {
263
- lm_ggml_backend_reg_t reg = lm_ggml_backend_dev_backend_reg(dev);
264
- auto lm_ggml_backend_split_buffer_type_fn = (lm_ggml_backend_split_buffer_type_t)
265
- lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_split_buffer_type");
266
- if (lm_ggml_backend_split_buffer_type_fn) {
267
- size_t dev_index = [&]() {
268
- auto * reg = lm_ggml_backend_dev_backend_reg(dev);
269
- for (size_t i = 0; i < lm_ggml_backend_reg_dev_count(reg); ++i) {
270
- if (lm_ggml_backend_reg_dev_get(reg, i) == dev) {
271
- return i;
272
- }
273
- }
274
- throw std::runtime_error(format("device %s not found in its backend reg", lm_ggml_backend_dev_name(dev)));
275
- }();
276
- auto * buft = lm_ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
277
- if (buft != nullptr) {
278
- buft_list.emplace_back(dev, buft);
279
- }
280
- }
281
- }
282
-
283
- // add the device default buffer type
284
- buft_list.emplace_back(dev, lm_ggml_backend_dev_buffer_type(dev));
285
-
286
- return buft_list;
287
- }
288
-
289
- // Returns false if cancelled by progress_callback
290
- static bool llm_load_tensors(
291
- llama_model_loader & ml,
292
- llama_model & model,
293
- int n_gpu_layers,
294
- enum llama_split_mode split_mode,
295
- int main_gpu,
296
- const float * tensor_split,
297
- bool use_mlock,
298
- llama_progress_callback progress_callback,
299
- void * progress_callback_user_data) {
300
- auto & hparams = model.hparams;
301
-
302
- model.split_mode = split_mode;
303
- model.main_gpu = main_gpu;
304
- model.n_gpu_layers = n_gpu_layers;
305
-
306
- const int n_layer = hparams.n_layer;
307
-
308
- bool use_mmap_buffer = true;
309
-
310
- // build a list of buffer types for the CPU and GPU devices
311
- model.cpu_buft_list = make_cpu_buft_list(model);
312
- for (auto * dev : model.devices) {
313
- llama_model::buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
314
- // add CPU buffer types as a fallback
315
- buft_list.insert(buft_list.end(), model.cpu_buft_list.begin(), model.cpu_buft_list.end());
316
- model.gpu_buft_list.emplace(dev, std::move(buft_list));
317
- }
318
-
319
- // calculate the split points
320
- int device_count = llama_get_device_count(model);
321
- bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
322
- std::vector<float> splits(device_count);
323
- if (all_zero) {
324
- // default split, by free memory
325
- for (int i = 0; i < device_count; ++i) {
326
- lm_ggml_backend_dev_t dev = model.devices[i];
327
- size_t total;
328
- size_t free;
329
- lm_ggml_backend_dev_memory(dev, &free, &total);
330
- splits[i] = free;
331
- }
332
- } else {
333
- std::copy(tensor_split, tensor_split + device_count, splits.begin());
334
- }
335
-
336
- // sum and normalize the splits to get the split points
337
- float split_sum = 0.0f;
338
- for (int i = 0; i < device_count; ++i) {
339
- split_sum += splits[i];
340
- splits[i] = split_sum;
341
- }
342
- for (int i = 0; i < device_count; ++i) {
343
- splits[i] /= split_sum;
344
- }
345
-
346
- lm_ggml_backend_dev_t cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
347
- const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
348
- const int act_gpu_layers = model.devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
349
- auto get_layer_buft_list = [&](int il) -> llama_model::layer_dev {
350
- if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
351
- return {cpu_dev, &model.cpu_buft_list};
352
- }
353
- int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
354
- auto * dev = model.devices.at(layer_gpu);
355
- return {dev, &model.gpu_buft_list.at(dev)};
356
- };
357
-
358
- // assign the input layer
359
- // there is very little benefit to offloading the input layer, so always keep it on the CPU
360
- model.dev_input = { cpu_dev, &model.cpu_buft_list };
361
-
362
- // assign the repeating layers to the devices according to the splits
363
- model.dev_layer.resize(n_layer);
364
- for (int il = 0; il < n_layer; ++il) {
365
- model.dev_layer[il] = get_layer_buft_list(il);
366
- }
367
- // assign the output layer
368
- model.dev_output = get_layer_buft_list(n_layer);
369
-
370
- // one ggml context per buffer type
371
- int max_n_tensors = ml.n_tensors;
372
- max_n_tensors += 1; // duplicated output tensor
373
- max_n_tensors += n_layer*2; // duplicated rope freq tensors
374
- const size_t ctx_size = lm_ggml_tensor_overhead()*max_n_tensors;
375
-
376
- std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
377
- auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
378
- auto it = ctx_map.find(buft);
379
- if (it == ctx_map.end()) {
380
- lm_ggml_init_params params = {
381
- /*.mem_size =*/ ctx_size,
382
- /*.mem_buffer =*/ NULL,
383
- /*.no_alloc =*/ true,
384
- };
385
- lm_ggml_context * ctx = lm_ggml_init(params);
386
- if (!ctx) {
387
- throw std::runtime_error(format("failed to create ggml context"));
388
- }
389
- ctx_map[buft] = ctx;
390
- model.ctxs.emplace_back(ctx);
391
- return ctx;
392
- }
393
- return it->second;
394
- };
395
-
396
- // create tensors for the weights
397
- {
398
- // note: cast to int64_t since we will use these for the tensor dimensions
399
- const int64_t n_head = hparams.n_head();
400
- const int64_t n_head_kv = hparams.n_head_kv();
401
- const int64_t n_embd = hparams.n_embd;
402
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
403
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
404
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
405
- const int64_t n_embd_head_v = hparams.n_embd_head_v;
406
- const int64_t n_ff = hparams.n_ff();
407
- const int64_t n_embd_gqa = n_embd_v_gqa;
408
- const int64_t n_vocab = hparams.n_vocab;
409
- const int64_t n_vocab_type = hparams.n_vocab_type;
410
- const int64_t n_rot = hparams.n_rot;
411
- const int64_t n_expert = hparams.n_expert;
412
- const int64_t n_expert_used = hparams.n_expert_used;
413
- const int64_t n_ctx_train = hparams.n_ctx_train;
414
-
415
- if (n_expert > 0 && hparams.n_expert_used == 0) {
416
- throw std::runtime_error("model has expert layers but no expert layers are used");
417
- }
418
-
419
- int n_moved_tensors = 0;
420
- lm_ggml_tensor * first_moved_tensor = nullptr;
421
- lm_ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
422
- lm_ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
423
-
424
- auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> lm_ggml_tensor * {
425
- lm_ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
426
-
427
- if (!t_meta) {
428
- if (flags & llama_model_loader::TENSOR_NOT_REQUIRED) {
429
- return nullptr;
430
- }
431
- throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
432
- }
433
-
434
- // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
435
- // the tensor is duplicated
436
- // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
437
- llm_tensor tn_tensor = tn.tensor;
438
- if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & llama_model_loader::TENSOR_DUPLICATED) {
439
- tn_tensor = LLM_TENSOR_OUTPUT;
440
- }
441
-
442
- llm_tensor_info info;
443
- try {
444
- info = llm_tensor_info_for(tn_tensor);
445
- } catch (const std::out_of_range & e) {
446
- throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
447
- }
448
-
449
- // tensors with "bias" suffix are always used with LM_GGML_OP_ADD
450
- lm_ggml_op op;
451
- bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
452
- if (bias) {
453
- op = LM_GGML_OP_ADD;
454
- } else {
455
- op = info.op;
456
- }
457
-
458
- // sanity checks
459
- if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
460
- if (tn.bid != -1) {
461
- LM_GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
462
- }
463
- } else {
464
- if (tn.bid == -1) {
465
- LM_GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
466
- }
467
- }
468
-
469
- // select the buffer type for this tensor
470
- llama_model::buft_list_t * buft_list;
471
- switch (info.layer) {
472
- case LLM_TENSOR_LAYER_INPUT:
473
- buft_list = model.dev_input.buft_list;
474
- break;
475
- case LLM_TENSOR_LAYER_OUTPUT:
476
- buft_list = model.dev_output.buft_list;
477
- break;
478
- case LLM_TENSOR_LAYER_REPEATING:
479
- buft_list = model.dev_layer.at(tn.bid).buft_list;
480
- break;
481
- default:
482
- LM_GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
483
- }
484
-
485
- lm_ggml_backend_buffer_type_t buft = select_weight_buft(model, t_meta, op, *buft_list);
486
- if (!buft) {
487
- throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
488
- }
489
-
490
- // avoid using a host buffer when using mmap
491
- auto * buft_dev = lm_ggml_backend_buft_get_device(buft);
492
- if (ml.use_mmap && buft_dev && buft == lm_ggml_backend_dev_host_buffer_type(buft_dev)) {
493
- auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
494
- buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
495
- }
496
-
497
- if (buft != buft_list->front().second) {
498
- n_moved_tensors++;
499
- if (!first_moved_tensor) {
500
- first_moved_tensor = t_meta;
501
- first_moved_from_buft = buft_list->front().second;
502
- first_moved_to_buft = buft;
503
- }
504
- }
505
-
506
- lm_ggml_context * ctx = ctx_for_buft(buft);
507
-
508
- // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
509
- if (flags & llama_model_loader::TENSOR_DUPLICATED) {
510
- lm_ggml_tensor * t = lm_ggml_get_tensor(ctx, tn.str().c_str());
511
- if (t) {
512
- return t;
513
- }
514
- }
515
- return ml.create_tensor(ctx, tn, ne, flags);
516
- };
517
-
518
- model.layers.resize(n_layer);
519
-
520
- // TODO: move to a separate function
521
- const auto tn = LLM_TN(model.arch);
522
- switch (model.arch) {
523
- case LLM_ARCH_LLAMA:
524
- case LLM_ARCH_REFACT:
525
- case LLM_ARCH_MINICPM:
526
- case LLM_ARCH_GRANITE:
527
- case LLM_ARCH_GRANITE_MOE:
528
- {
529
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
530
-
531
- // output
532
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
533
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
534
-
535
- // if output is NULL, init from the input tok embed
536
- if (model.output == NULL) {
537
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
538
- }
539
-
540
- for (int i = 0; i < n_layer; ++i) {
541
- auto & layer = model.layers[i];
542
-
543
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
544
-
545
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
546
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
547
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
548
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
549
-
550
- // optional bias tensors
551
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
552
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
553
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
554
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
555
-
556
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
557
-
558
- if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
559
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
560
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
561
- }
562
- else {
563
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
564
- }
565
-
566
- if (n_expert == 0) {
567
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
568
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
569
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
570
-
571
- // optional MLP bias
572
- layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
573
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
574
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
575
- } else {
576
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
577
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
578
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
579
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
580
- }
581
- }
582
- } break;
583
- case LLM_ARCH_DECI:
584
- {
585
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
586
-
587
- // output
588
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
589
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
590
-
591
- // if output is NULL, init from the input tok embed
592
- if (model.output == NULL) {
593
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
594
- }
595
-
596
- for (int i = 0; i < n_layer; ++i) {
597
- auto & layer = model.layers[i];
598
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
599
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
600
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
601
- const int64_t n_ff = hparams.n_ff(i);
602
- const int64_t n_head = hparams.n_head(i);
603
- const int64_t n_head_kv = hparams.n_head_kv(i);
604
-
605
- if (n_head_kv == 0 && n_head > 0) {
606
- // linear attention for DeciLMCausalModel
607
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
608
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
609
- }
610
- else if (n_head_kv > 0) {
611
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
612
-
613
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
614
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
615
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
616
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
617
- }
618
-
619
- // optional bias tensors
620
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
621
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
622
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
623
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
624
-
625
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
626
-
627
- if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
628
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
629
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
630
- }
631
- else {
632
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
633
- }
634
-
635
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
636
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
637
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
638
-
639
- // optional MLP bias
640
- layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
641
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
642
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
643
- }
644
- } break;
645
- case LLM_ARCH_MINICPM3:
646
- {
647
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
648
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
649
-
650
- const int64_t q_lora_rank = hparams.n_lora_q;
651
- const int64_t kv_lora_rank = hparams.n_lora_kv;
652
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
653
-
654
- // output
655
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
656
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
657
-
658
- // if output is NULL, init from the input tok embed
659
- if (model.output == NULL) {
660
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
661
- }
662
-
663
- for (int i = 0; i < n_layer; ++i) {
664
- auto & layer = model.layers[i];
665
-
666
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
667
- layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
668
-
669
- layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
670
-
671
- layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
672
- layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
673
-
674
- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
675
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
676
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
677
-
678
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
679
-
680
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
681
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
682
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
683
-
684
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
685
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
686
- }
687
- } break;
688
- case LLM_ARCH_GROK:
689
- {
690
- if (n_expert == 0) {
691
- throw std::runtime_error("Grok model cannot have zero experts");
692
- }
693
-
694
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
695
-
696
- // output
697
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
698
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
699
-
700
- // if output is NULL, init from the input tok embed
701
- if (model.output == NULL) {
702
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
703
- }
704
-
705
- for (int i = 0; i < n_layer; ++i) {
706
- auto & layer = model.layers[i];
707
-
708
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
709
-
710
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
711
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
712
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
713
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
714
-
715
- layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
716
-
717
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
718
-
719
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
720
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
721
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
722
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
723
-
724
- layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
725
- }
726
- } break;
727
- case LLM_ARCH_DBRX:
728
- {
729
- if (n_expert == 0) {
730
- throw std::runtime_error("DBRX model cannot have zero experts");
731
- }
732
-
733
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
734
-
735
- // output
736
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
737
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
738
-
739
- for (int i = 0; i < n_layer; ++i) {
740
- auto & layer = model.layers[i];
741
-
742
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
743
-
744
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
745
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
746
-
747
- layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
748
-
749
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
750
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
751
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
752
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
753
- }
754
- } break;
755
- case LLM_ARCH_BAICHUAN:
756
- {
757
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
758
- {
759
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
760
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
761
- }
762
-
763
- for (int i = 0; i < n_layer; ++i) {
764
- auto & layer = model.layers[i];
765
-
766
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
767
-
768
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
769
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
770
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
771
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
772
-
773
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
774
-
775
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
776
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
777
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
778
- }
779
- } break;
780
- case LLM_ARCH_FALCON:
781
- {
782
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
783
-
784
- // output
785
- {
786
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
787
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
788
-
789
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
790
- if (!model.output) {
791
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
792
- }
793
- }
794
-
795
- for (int i = 0; i < n_layer; ++i) {
796
- auto & layer = model.layers[i];
797
-
798
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
799
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
800
-
801
- layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
802
- layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
803
-
804
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
805
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
806
-
807
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
808
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
809
- }
810
- } break;
811
- case LLM_ARCH_STARCODER:
812
- {
813
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
814
- model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
815
-
816
- // output
817
- {
818
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
819
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
820
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
821
- if (!model.output) {
822
- // needs to be on GPU
823
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
824
- }
825
-
826
- }
827
-
828
- for (int i = 0; i < n_layer; ++i) {
829
- auto & layer = model.layers[i];
830
-
831
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
832
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
833
-
834
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
835
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
836
-
837
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
838
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
839
-
840
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
841
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
842
-
843
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
844
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
845
-
846
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
847
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
848
- }
849
- } break;
850
- case LLM_ARCH_BERT:
851
- case LLM_ARCH_NOMIC_BERT:
852
- {
853
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
854
- model.type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0);
855
-
856
- if (model.arch == LLM_ARCH_BERT) {
857
- model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
858
-
859
- model.cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
860
- model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
861
-
862
- model.cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
863
- model.cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
864
- }
865
-
866
- model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
867
- model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
868
-
869
- for (int i = 0; i < n_layer; ++i) {
870
- auto & layer = model.layers[i];
871
-
872
- if (model.arch == LLM_ARCH_BERT) {
873
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
874
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
875
-
876
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
877
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
878
-
879
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
880
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
881
- } else {
882
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
883
- }
884
-
885
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
886
-
887
- layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
888
- layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
889
-
890
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
891
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
892
-
893
- if (model.arch == LLM_ARCH_BERT) {
894
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
895
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
896
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
897
- } else {
898
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
899
- }
900
-
901
- layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
902
- layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
903
- }
904
- } break;
905
- case LLM_ARCH_JINA_BERT_V2:
906
- {
907
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
908
- model.type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0); // token_type_embeddings
909
-
910
- model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
911
- model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); //LayerNorm bias
912
-
913
- model.cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
914
- model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
915
- for (int i = 0; i < n_layer; ++i) {
916
- auto & layer = model.layers[i]; // JinaBertLayer
917
-
918
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
919
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
920
-
921
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
922
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
923
-
924
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
925
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
926
-
927
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
928
- layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
929
-
930
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
931
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
932
-
933
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
934
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
935
-
936
- layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
937
- layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
938
-
939
- layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
940
- layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
941
-
942
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
943
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
944
-
945
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
946
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
947
-
948
- layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
949
- layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
950
- }
951
- } break;
952
- case LLM_ARCH_BLOOM:
953
- {
954
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
955
- model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
956
- model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
957
-
958
- // output
959
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
960
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
961
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
962
-
963
- for (int i = 0; i < n_layer; ++i) {
964
- auto & layer = model.layers[i];
965
-
966
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
967
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
968
-
969
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
970
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
971
-
972
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
973
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
974
-
975
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
976
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
977
-
978
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
979
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
980
-
981
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
982
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
983
- }
984
- } break;
985
- case LLM_ARCH_MPT:
986
- {
987
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
988
- model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
989
-
990
- // output
991
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
992
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
993
-
994
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
995
- if (!model.output) {
996
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
997
- }
998
-
999
- for (int i = 0; i < n_layer; ++i) {
1000
- auto & layer = model.layers[i];
1001
-
1002
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1003
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
1004
-
1005
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
1006
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
1007
-
1008
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1009
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
1010
-
1011
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1012
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
1013
-
1014
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
1015
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
1016
-
1017
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1018
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
1019
-
1020
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
1021
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
1022
-
1023
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
1024
- layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
1025
-
1026
- // AWQ ScaleActivation layer
1027
- layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
1028
- }
1029
- } break;
1030
- case LLM_ARCH_STABLELM:
1031
- {
1032
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1033
-
1034
- // output
1035
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
1036
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1037
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1038
-
1039
- for (int i = 0; i < n_layer; ++i) {
1040
- auto & layer = model.layers[i];
1041
-
1042
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1043
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
1044
-
1045
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1046
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1047
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1048
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1049
-
1050
- // optional bias tensors, present in Stable LM 2 1.6B
1051
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
1052
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
1053
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
1054
-
1055
- // optional q and k layernorms, present in StableLM 2 12B
1056
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
1057
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
1058
-
1059
- // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
1060
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
1061
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
1062
-
1063
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1064
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1065
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1066
- }
1067
- } break;
1068
- case LLM_ARCH_QWEN:
1069
- {
1070
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1071
-
1072
- // output
1073
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1074
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1075
-
1076
- for (int i = 0; i < n_layer; ++i) {
1077
- auto & layer = model.layers[i];
1078
-
1079
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1080
-
1081
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
1082
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
1083
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1084
-
1085
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1086
-
1087
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
1088
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
1089
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
1090
- }
1091
- } break;
1092
- case LLM_ARCH_QWEN2:
1093
- case LLM_ARCH_QWEN2VL:
1094
- {
1095
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1096
-
1097
- // output
1098
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1099
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
1100
- // if output is NULL, init from the input tok embed
1101
- if (model.output == NULL) {
1102
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
1103
- }
1104
-
1105
- for (int i = 0; i < n_layer; ++i) {
1106
- auto & layer = model.layers[i];
1107
-
1108
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1109
-
1110
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1111
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1112
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1113
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1114
-
1115
- // optional bias tensors
1116
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
1117
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
1118
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
1119
-
1120
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1121
-
1122
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1123
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1124
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1125
- }
1126
- } break;
1127
- case LLM_ARCH_QWEN2MOE:
1128
- {
1129
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1130
-
1131
- // output
1132
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1133
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1134
-
1135
- for (int i = 0; i < n_layer; ++i) {
1136
- auto & layer = model.layers[i];
1137
-
1138
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1139
-
1140
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1141
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1142
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1143
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1144
-
1145
- // optional bias tensors
1146
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
1147
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
1148
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
1149
-
1150
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1151
-
1152
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
1153
-
1154
- if (n_expert == 0) {
1155
- throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
1156
- }
1157
- if (n_expert_used == 0) {
1158
- throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
1159
- }
1160
-
1161
- // MoE branch
1162
- const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
1163
-
1164
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
1165
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
1166
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
1167
-
1168
- // Shared expert branch
1169
- const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
1170
-
1171
- layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
1172
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
1173
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
1174
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
1175
- }
1176
- } break;
1177
- case LLM_ARCH_PHI2:
1178
- {
1179
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1180
-
1181
- // output
1182
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1183
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
1184
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1185
- model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
1186
-
1187
- for (int i = 0; i < n_layer; ++i) {
1188
- auto & layer = model.layers[i];
1189
-
1190
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1191
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
1192
-
1193
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
1194
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
1195
-
1196
- if (layer.wqkv == nullptr) {
1197
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1198
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
1199
-
1200
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1201
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
1202
-
1203
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1204
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
1205
- }
1206
-
1207
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1208
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
1209
-
1210
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
1211
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
1212
-
1213
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1214
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
1215
- }
1216
- } break;
1217
- case LLM_ARCH_PHI3:
1218
- {
1219
- const int64_t n_embd_head = n_embd / n_head;
1220
-
1221
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
1222
-
1223
- // output
1224
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
1225
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
1226
-
1227
- for (int i = 0; i < n_layer; ++i) {
1228
- auto & layer = model.layers[i];
1229
-
1230
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
1231
-
1232
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
1233
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
1234
-
1235
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
1236
-
1237
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
1238
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
1239
-
1240
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
1241
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
1242
- }
1243
- } break;
1244
- case LLM_ARCH_PLAMO:
1245
- {
1246
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1247
-
1248
- // output
1249
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1250
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1251
-
1252
- for (int i = 0; i < n_layer; ++i) {
1253
- auto & layer = model.layers[i];
1254
-
1255
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1256
-
1257
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1258
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1259
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1260
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1261
-
1262
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1263
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1264
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1265
- }
1266
- } break;
1267
- case LLM_ARCH_GPT2:
1268
- {
1269
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1270
- model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
1271
-
1272
- // output
1273
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1274
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
1275
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1276
-
1277
- for (int i = 0; i < n_layer; ++i) {
1278
- auto & layer = model.layers[i];
1279
-
1280
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1281
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
1282
-
1283
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
1284
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
1285
-
1286
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1287
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
1288
-
1289
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1290
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
1291
-
1292
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
1293
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
1294
-
1295
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1296
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
1297
- }
1298
- } break;
1299
- case LLM_ARCH_CODESHELL:
1300
- {
1301
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1302
-
1303
- // output
1304
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1305
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
1306
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1307
-
1308
- for (int i = 0; i < n_layer; ++i) {
1309
- auto & layer = model.layers[i];
1310
-
1311
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1312
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
1313
-
1314
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
1315
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
1316
-
1317
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1318
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
1319
-
1320
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1321
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
1322
-
1323
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
1324
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
1325
-
1326
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1327
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
1328
- }
1329
- } break;
1330
- case LLM_ARCH_ORION:
1331
- {
1332
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1333
-
1334
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1335
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
1336
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1337
-
1338
- for (int i = 0; i < n_layer; ++i) {
1339
- auto & layer = model.layers[i];
1340
-
1341
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1342
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
1343
-
1344
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1345
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1346
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1347
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1348
-
1349
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1350
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
1351
-
1352
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1353
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1354
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1355
- }
1356
- } break;
1357
- case LLM_ARCH_INTERNLM2:
1358
- {
1359
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1360
-
1361
- // output
1362
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1363
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1364
-
1365
- for (int i = 0; i < n_layer; ++i) {
1366
- auto & layer = model.layers[i];
1367
-
1368
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1369
- // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
1370
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1371
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1372
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1373
-
1374
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1375
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1376
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1377
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1378
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1379
- }
1380
- } break;
1381
- case LLM_ARCH_GEMMA:
1382
- {
1383
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1384
-
1385
- // output
1386
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1387
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
1388
-
1389
- for (int i = 0; i < n_layer; ++i) {
1390
- auto & layer = model.layers[i];
1391
-
1392
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1393
-
1394
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
1395
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
1396
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
1397
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
1398
-
1399
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1400
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1401
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1402
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1403
- }
1404
- } break;
1405
- case LLM_ARCH_GEMMA2:
1406
- {
1407
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1408
-
1409
- // output
1410
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1411
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
1412
-
1413
- for (int i = 0; i < n_layer; ++i) {
1414
- auto & layer = model.layers[i];
1415
-
1416
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1417
-
1418
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
1419
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
1420
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
1421
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
1422
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
1423
-
1424
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1425
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1426
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1427
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1428
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
1429
- }
1430
- } break;
1431
- case LLM_ARCH_STARCODER2:
1432
- {
1433
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1434
-
1435
- // output
1436
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1437
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
1438
-
1439
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
1440
- // if output is NULL, init from the input tok embed
1441
- if (model.output == NULL) {
1442
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
1443
- }
1444
-
1445
- for (int i = 0; i < n_layer; ++i) {
1446
- auto & layer = model.layers[i];
1447
-
1448
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1449
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
1450
-
1451
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1452
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1453
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1454
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1455
-
1456
- // optional bias tensors
1457
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
1458
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
1459
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
1460
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
1461
-
1462
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1463
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
1464
-
1465
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1466
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1467
-
1468
- // optional bias tensors
1469
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
1470
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
1471
- }
1472
- } break;
1473
- case LLM_ARCH_MAMBA:
1474
- {
1475
- const int64_t d_conv = hparams.ssm_d_conv;
1476
- const int64_t d_inner = hparams.ssm_d_inner;
1477
- const int64_t d_state = hparams.ssm_d_state;
1478
- const int64_t dt_rank = hparams.ssm_dt_rank;
1479
-
1480
- // only an expansion factor of 2 is supported for now
1481
- if (2 * n_embd != d_inner) {
1482
- throw std::runtime_error("only an expansion factor of 2 is supported for now");
1483
- }
1484
-
1485
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1486
-
1487
- // output
1488
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1489
-
1490
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
1491
- // if output is NULL, init from the input tok embed, duplicated to allow offloading
1492
- if (model.output == NULL) {
1493
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
1494
- }
1495
-
1496
- for (int i = 0; i < n_layer; ++i) {
1497
- auto & layer = model.layers[i];
1498
-
1499
- // norm
1500
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1501
-
1502
- layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
1503
-
1504
- layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
1505
- layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
1506
-
1507
- layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
1508
-
1509
- layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
1510
- layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
1511
-
1512
- // no "weight" suffix for these
1513
- layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
1514
- layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
1515
-
1516
- // out_proj
1517
- layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
1518
- }
1519
- } break;
1520
- case LLM_ARCH_XVERSE:
1521
- {
1522
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1523
-
1524
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1525
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1526
-
1527
- for (int i = 0; i < n_layer; ++i) {
1528
- auto & layer = model.layers[i];
1529
-
1530
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1531
-
1532
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1533
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1534
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1535
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1536
-
1537
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1538
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1539
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1540
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1541
- }
1542
- } break;
1543
- case LLM_ARCH_COMMAND_R:
1544
- {
1545
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1546
-
1547
- // output
1548
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1549
- // init output from the input tok embed
1550
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
1551
-
1552
- for (int i = 0; i < n_layer; ++i) {
1553
- auto & layer = model.layers[i];
1554
-
1555
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1556
-
1557
- if (n_layer >= 64){
1558
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
1559
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
1560
- }
1561
-
1562
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1563
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1564
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1565
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1566
-
1567
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1568
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1569
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1570
- }
1571
- } break;
1572
- case LLM_ARCH_COHERE2:
1573
- {
1574
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
1575
-
1576
- // output
1577
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
1578
- // init output from the input tok embed
1579
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
1580
- llama_model_loader::TENSOR_DUPLICATED);
1581
-
1582
- for (int i = 0; i < n_layer; ++i) {
1583
- auto & layer = model.layers[i];
1584
-
1585
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
1586
-
1587
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
1588
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
1589
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
1590
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
1591
-
1592
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
1593
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
1594
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
1595
- }
1596
- }
1597
- break;
1598
- case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
1599
- {
1600
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1601
-
1602
- // output
1603
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
1604
- // if output is NULL, init from the input tok embed
1605
- if (model.output == NULL) {
1606
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
1607
- }
1608
-
1609
- for (int i = 0; i < n_layer; ++i) {
1610
- auto & layer = model.layers[i];
1611
-
1612
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1613
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1614
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1615
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1616
-
1617
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1618
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1619
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1620
- }
1621
- } break;
1622
- case LLM_ARCH_OLMO2:
1623
- {
1624
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1625
-
1626
- // output
1627
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1628
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1629
-
1630
- for (int i = 0; i < n_layer; ++i) {
1631
- auto & layer = model.layers[i];
1632
-
1633
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1634
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1635
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1636
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1637
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
1638
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
1639
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
1640
-
1641
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1642
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1643
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1644
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
1645
- }
1646
- } break;
1647
- case LLM_ARCH_OLMOE:
1648
- {
1649
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1650
-
1651
- // output
1652
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1653
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1654
-
1655
- for (int i = 0; i < n_layer; ++i) {
1656
- auto & layer = model.layers[i];
1657
-
1658
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1659
-
1660
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1661
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1662
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1663
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1664
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
1665
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
1666
-
1667
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1668
-
1669
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
1670
-
1671
- if (n_expert == 0) {
1672
- throw std::runtime_error("n_expert must be > 0");
1673
- }
1674
- if (n_expert_used == 0) {
1675
- throw std::runtime_error("n_expert_used must be > 0");
1676
- }
1677
-
1678
- // MoE branch
1679
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
1680
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
1681
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
1682
- }
1683
- } break;
1684
- case LLM_ARCH_OPENELM:
1685
- {
1686
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1687
-
1688
- // output
1689
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1690
- // init output from the input tok embed
1691
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
1692
-
1693
- for (int i = 0; i < n_layer; ++i) {
1694
- const int64_t n_head = hparams.n_head(i);
1695
- const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
1696
- const int64_t n_ff = hparams.n_ff(i);
1697
-
1698
- auto & layer = model.layers[i];
1699
-
1700
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1701
-
1702
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
1703
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
1704
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
1705
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
1706
-
1707
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1708
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1709
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
1710
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1711
- }
1712
- } break;
1713
- case LLM_ARCH_GPTNEOX:
1714
- {
1715
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1716
-
1717
- // output
1718
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1719
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
1720
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1721
-
1722
- for (int i = 0; i < n_layer; ++i) {
1723
- auto & layer = model.layers[i];
1724
-
1725
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1726
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
1727
-
1728
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
1729
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
1730
-
1731
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1732
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
1733
-
1734
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1735
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
1736
-
1737
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
1738
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
1739
-
1740
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1741
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
1742
- }
1743
- } break;
1744
- case LLM_ARCH_ARCTIC:
1745
- {
1746
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1747
-
1748
- // output
1749
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1750
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
1751
-
1752
- // if output is NULL, init from the input tok embed
1753
- if (model.output == NULL) {
1754
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
1755
- }
1756
-
1757
- for (int i = 0; i < n_layer; ++i) {
1758
- auto & layer = model.layers[i];
1759
-
1760
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1761
-
1762
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1763
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1764
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1765
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1766
-
1767
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1768
-
1769
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
1770
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
1771
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
1772
-
1773
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
1774
- layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
1775
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
1776
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
1777
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
1778
- }
1779
- } break;
1780
- case LLM_ARCH_DEEPSEEK:
1781
- {
1782
-
1783
- const int64_t n_ff_exp = hparams.n_ff_exp;
1784
- const int64_t n_expert_shared = hparams.n_expert_shared;
1785
-
1786
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1787
-
1788
- // output
1789
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1790
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1791
-
1792
- for (int i = 0; i < n_layer; ++i) {
1793
- auto & layer = model.layers[i];
1794
-
1795
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1796
-
1797
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1798
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1799
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1800
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1801
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1802
-
1803
- if (i < (int) hparams.n_layer_dense_lead) {
1804
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1805
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1806
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1807
- } else {
1808
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
1809
-
1810
- if (n_expert == 0) {
1811
- throw std::runtime_error("n_expert must be > 0");
1812
- }
1813
- if (n_expert_used == 0) {
1814
- throw std::runtime_error("n_expert_used must be > 0");
1815
- }
1816
-
1817
- // MoE branch
1818
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
1819
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
1820
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
1821
-
1822
- // Shared expert branch
1823
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
1824
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
1825
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
1826
- }
1827
- }
1828
- } break;
1829
- case LLM_ARCH_DEEPSEEK2:
1830
- {
1831
- const bool is_lite = (hparams.n_layer == 27);
1832
-
1833
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
1834
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
1835
-
1836
- const int64_t q_lora_rank = hparams.n_lora_q;
1837
- const int64_t kv_lora_rank = hparams.n_lora_kv;
1838
-
1839
- const int64_t n_ff_exp = hparams.n_ff_exp;
1840
- const int64_t n_expert_shared = hparams.n_expert_shared;
1841
-
1842
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1843
-
1844
- // output
1845
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1846
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
1847
-
1848
- for (int i = 0; i < n_layer; ++i) {
1849
- auto & layer = model.layers[i];
1850
-
1851
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1852
- if (!is_lite) {
1853
- layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
1854
- }
1855
-
1856
- layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
1857
-
1858
- if (!is_lite) {
1859
- layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
1860
- layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
1861
- } else {
1862
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
1863
- }
1864
-
1865
- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
1866
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
1867
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
1868
-
1869
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1870
-
1871
- if (i < (int) hparams.n_layer_dense_lead) {
1872
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1873
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1874
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1875
- } else {
1876
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
1877
- layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
1878
-
1879
- if (n_expert == 0) {
1880
- throw std::runtime_error("n_expert must be > 0");
1881
- }
1882
- if (n_expert_used == 0) {
1883
- throw std::runtime_error("n_expert_used must be > 0");
1884
- }
1885
-
1886
- // MoE branch
1887
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
1888
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
1889
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
1890
-
1891
- // Shared expert branch
1892
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
1893
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
1894
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
1895
- }
1896
- }
1897
- } break;
1898
- case LLM_ARCH_BITNET:
1899
- {
1900
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1901
-
1902
- // output
1903
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
1904
-
1905
- for (int i = 0; i < n_layer; ++i) {
1906
- auto & layer = model.layers[i];
1907
-
1908
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
1909
- layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
1910
-
1911
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
1912
- layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
1913
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
1914
- layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
1915
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
1916
- layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
1917
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1918
- layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
1919
-
1920
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
1921
- layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
1922
-
1923
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
1924
- layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
1925
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
1926
- layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
1927
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1928
- layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
1929
- }
1930
- } break;
1931
- case LLM_ARCH_T5:
1932
- {
1933
- const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
1934
-
1935
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1936
-
1937
- // output
1938
- model.output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
1939
- model.output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
1940
-
1941
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
1942
- // if output is NULL, init from the input tok embed
1943
- if (model.output == NULL) {
1944
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
1945
- }
1946
-
1947
- for (int i = 0; i < n_layer; ++i) {
1948
- auto & layer = model.layers[i];
1949
-
1950
- layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
1951
- layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
1952
-
1953
- layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
1954
- layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
1955
- layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
1956
- layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
1957
-
1958
- layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
1959
- layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
1960
- layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1961
- layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1962
-
1963
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
1964
- layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
1965
-
1966
- layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
1967
- layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
1968
- layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
1969
- layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
1970
-
1971
- layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
1972
- // this tensor seems to be unused in HF transformers implementation
1973
- layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
1974
-
1975
- layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
1976
- layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
1977
- layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
1978
- layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
1979
-
1980
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
1981
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
1982
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
1983
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
1984
- }
1985
- } break;
1986
- case LLM_ARCH_T5ENCODER:
1987
- {
1988
- const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
1989
-
1990
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
1991
-
1992
- // output
1993
- model.output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
1994
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
1995
- // if output is NULL, init from the input tok embed
1996
- if (model.output == NULL) {
1997
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
1998
- }
1999
-
2000
- for (int i = 0; i < n_layer; ++i) {
2001
- auto & layer = model.layers[i];
2002
-
2003
- layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
2004
- layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
2005
-
2006
- layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2007
- layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2008
- layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
2009
- layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
2010
-
2011
- layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
2012
- layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
2013
- layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2014
- layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2015
- }
2016
- } break;
2017
- case LLM_ARCH_JAIS:
2018
- {
2019
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2020
-
2021
- // output
2022
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2023
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2024
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2025
-
2026
- for (int i = 0; i < n_layer; ++i) {
2027
- auto & layer = model.layers[i];
2028
-
2029
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2030
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2031
-
2032
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2033
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
2034
-
2035
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2036
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
2037
-
2038
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2039
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
2040
-
2041
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2042
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
2043
-
2044
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2045
- layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
2046
-
2047
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2048
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
2049
- }
2050
- } break;
2051
- case LLM_ARCH_CHATGLM:
2052
- {
2053
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2054
-
2055
- // output
2056
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2057
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2058
-
2059
- for (int i = 0; i < n_layer; ++i) {
2060
- auto & layer = model.layers[i];
2061
-
2062
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2063
-
2064
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2065
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
2066
-
2067
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2068
-
2069
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2070
-
2071
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
2072
-
2073
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2074
- }
2075
- } break;
2076
- case LLM_ARCH_NEMOTRON:
2077
- {
2078
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2079
-
2080
- // output
2081
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2082
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2083
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2084
-
2085
- for (int i = 0; i < n_layer; ++i) {
2086
- auto & layer = model.layers[i];
2087
-
2088
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2089
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2090
-
2091
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2092
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2093
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2094
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2095
-
2096
- // optional bias tensors
2097
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
2098
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
2099
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
2100
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
2101
-
2102
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2103
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
2104
-
2105
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2106
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2107
-
2108
- // optional MLP bias
2109
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
2110
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
2111
- }
2112
- } break;
2113
- case LLM_ARCH_EXAONE:
2114
- {
2115
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2116
-
2117
- // output
2118
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2119
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2120
-
2121
- for (int i = 0; i < n_layer; ++i) {
2122
- auto & layer = model.layers[i];
2123
-
2124
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2125
-
2126
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
2127
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
2128
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
2129
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
2130
-
2131
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2132
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
2133
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2134
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2135
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2136
- }
2137
- } break;
2138
- case LLM_ARCH_RWKV6:
2139
- {
2140
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2141
-
2142
- // Block 0, LN0
2143
- model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
2144
- model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
2145
-
2146
- // output
2147
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2148
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2149
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2150
-
2151
- const int time_mix_extra_dim = hparams.time_mix_extra_dim;
2152
- const int time_decay_extra_dim = hparams.time_decay_extra_dim;
2153
- const int head_size = hparams.wkv_head_size;
2154
- const int attn_hidden_size = n_embd;
2155
- const int ffn_size = hparams.n_ff_arr[0];
2156
-
2157
- for (int i = 0; i < n_layer; ++i) {
2158
- auto & layer = model.layers[i];
2159
-
2160
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2161
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
2162
-
2163
- layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
2164
- layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
2165
-
2166
- layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
2167
- layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
2168
-
2169
- layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
2170
- layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, 0);
2171
- layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
2172
- layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, 0);
2173
- layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
2174
- layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, 0);
2175
-
2176
- layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
2177
- layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
2178
- layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
2179
- layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
2180
- layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
2181
- layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
2182
- layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
2183
- layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
2184
-
2185
- layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
2186
- layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
2187
- layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
2188
-
2189
- layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
2190
- layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
2191
-
2192
- layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
2193
- layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
2194
- layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
2195
- }
2196
-
2197
- } break;
2198
- case LLM_ARCH_CHAMELEON:
2199
- {
2200
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2201
-
2202
- // output
2203
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2204
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
2205
- // if output is NULL, init from the input tok embed
2206
- if (model.output == NULL) {
2207
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
2208
- }
2209
-
2210
- for (int i = 0; i < n_layer; ++i) {
2211
- auto & layer = model.layers[i];
2212
-
2213
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2214
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
2215
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
2216
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
2217
- layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
2218
-
2219
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2220
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
2221
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2222
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2223
-
2224
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2225
-
2226
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2227
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
2228
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2229
- }
2230
- } break;
2231
- case LLM_ARCH_WAVTOKENIZER_DEC:
2232
- {
2233
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
2234
-
2235
- model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
2236
- model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
2237
-
2238
- // posnet
2239
- {
2240
- const int64_t n_embd = hparams.posnet.n_embd;
2241
-
2242
- for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
2243
- auto & layer = model.layers[i].posnet;
2244
-
2245
- // posnet:
2246
- //
2247
- // - resnet
2248
- // - resnet
2249
- // - attn
2250
- // - resnet
2251
- // - resnet
2252
- // - norm
2253
- //
2254
- switch (i) {
2255
- case 0:
2256
- case 1:
2257
- case 3:
2258
- case 4:
2259
- {
2260
- layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
2261
- layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
2262
-
2263
- layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
2264
- layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
2265
-
2266
- layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
2267
- layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
2268
-
2269
- layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
2270
- layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
2271
- } break;
2272
- case 2:
2273
- {
2274
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
2275
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
2276
-
2277
- layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
2278
- layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
2279
-
2280
- layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
2281
- layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
2282
-
2283
- layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
2284
- layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
2285
-
2286
- layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
2287
- layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
2288
- } break;
2289
- case 5:
2290
- {
2291
- layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
2292
- layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
2293
- } break;
2294
- default: LM_GGML_ABORT("unknown posnet layer");
2295
- };
2296
- }
2297
- }
2298
-
2299
- LM_GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
2300
-
2301
- model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
2302
- model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
2303
-
2304
- // convnext
2305
- {
2306
- const int64_t n_embd = hparams.convnext.n_embd;
2307
-
2308
- for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
2309
- auto & layer = model.layers[i].convnext;
2310
-
2311
- layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
2312
- layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
2313
-
2314
- layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
2315
- layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
2316
-
2317
- layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
2318
- layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
2319
-
2320
- layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
2321
- layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
2322
-
2323
- layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
2324
- }
2325
-
2326
- // output
2327
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2328
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
2329
- }
2330
-
2331
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
2332
- model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
2333
- } break;
2334
- default:
2335
- throw std::runtime_error("unknown architecture");
2336
- }
2337
-
2338
- if (n_moved_tensors > 0) {
2339
- LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
2340
- __func__, first_moved_tensor->name, lm_ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
2341
- lm_ggml_backend_buft_name(first_moved_from_buft), lm_ggml_backend_buft_name(first_moved_to_buft));
2342
- }
2343
- }
2344
-
2345
- ml.done_getting_tensors();
2346
-
2347
- ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
2348
- model.mappings.reserve(ml.mappings.size());
2349
-
2350
- // create the backend buffers
2351
- std::vector<std::pair<lm_ggml_context *, llama_buf_map>> ctx_bufs;
2352
- ctx_bufs.reserve(ctx_map.size());
2353
-
2354
- // Ensure we have enough capacity for the maximum backend buffer we will potentially create
2355
- const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
2356
- model.bufs.reserve(n_max_backend_buffer);
2357
-
2358
- for (auto & it : ctx_map) {
2359
- lm_ggml_backend_buffer_type_t buft = it.first;
2360
- lm_ggml_context * ctx = it.second;
2361
-
2362
- // skip contexts without tensors
2363
- if (lm_ggml_get_first_tensor(ctx) == nullptr) {
2364
- continue;
2365
- }
2366
-
2367
- llama_buf_map bufs;
2368
- bufs.reserve(n_max_backend_buffer);
2369
11
 
2370
- // check if it is possible to use buffer_from_host_ptr with this buffer type
2371
- lm_ggml_backend_dev_t dev = lm_ggml_backend_buft_get_device(buft);
2372
- if (!dev) {
2373
- // FIXME: workaround for CPU backend buft having a NULL device
2374
- dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
2375
- }
2376
- lm_ggml_backend_dev_props props;
2377
- lm_ggml_backend_dev_get_props(dev, &props);
2378
- bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
2379
- bool is_default_buft = buft == lm_ggml_backend_dev_buffer_type(dev);
2380
-
2381
- if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
2382
- for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
2383
- // only the mmap region containing the tensors in the model is mapped to the backend buffer
2384
- // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
2385
- // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
2386
- void * addr = nullptr;
2387
- size_t first, last; // NOLINT
2388
- ml.get_mapping_range(&first, &last, &addr, idx, ctx);
2389
- if (first >= last) {
2390
- continue;
2391
- }
2392
- const size_t max_size = lm_ggml_get_max_tensor_size(ctx);
2393
- lm_ggml_backend_buffer_t buf = lm_ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
2394
- if (buf == nullptr) {
2395
- throw std::runtime_error(format("unable to allocate %s buffer", lm_ggml_backend_buft_name(buft)));
2396
- }
2397
- model.bufs.emplace_back(buf);
2398
- bufs.emplace(idx, buf);
2399
- }
2400
- }
2401
- else {
2402
- lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
2403
- if (buf == nullptr) {
2404
- throw std::runtime_error(format("unable to allocate %s buffer", lm_ggml_backend_buft_name(buft)));
2405
- }
2406
- model.bufs.emplace_back(buf);
2407
- if (use_mlock && lm_ggml_backend_buffer_is_host(buf)) {
2408
- model.mlock_bufs.emplace_back(new llama_mlock);
2409
- auto & mlock_buf = model.mlock_bufs.back();
2410
- mlock_buf->init (lm_ggml_backend_buffer_get_base(buf));
2411
- mlock_buf->grow_to(lm_ggml_backend_buffer_get_size(buf));
2412
- }
2413
- for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
2414
- bufs.emplace(idx, buf);
2415
- }
2416
- }
2417
-
2418
- if (bufs.empty()) {
2419
- throw std::runtime_error("failed to allocate buffer");
2420
- }
2421
-
2422
- for (auto & buf : bufs) {
2423
- // indicate that this buffer contains weights
2424
- // this is used by lm_ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
2425
- lm_ggml_backend_buffer_set_usage(buf.second, LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
2426
- }
2427
-
2428
- ctx_bufs.emplace_back(ctx, bufs);
2429
- }
2430
-
2431
- if (llama_supports_gpu_offload()) {
2432
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
2433
-
2434
- LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
2435
- if (n_gpu_layers > (int) hparams.n_layer) {
2436
- LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
2437
- }
2438
-
2439
- const int max_backend_supported_layers = hparams.n_layer + 1;
2440
- const int max_offloadable_layers = hparams.n_layer + 1;
2441
-
2442
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2443
- }
2444
-
2445
- // print memory requirements per buffer type
2446
- for (auto & buf : model.bufs) {
2447
- LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf.get()), lm_ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
2448
- }
2449
-
2450
- // populate tensors_by_name
2451
- for (auto & ctx : model.ctxs) {
2452
- for (auto * cur = lm_ggml_get_first_tensor(ctx.get()); cur != NULL; cur = lm_ggml_get_next_tensor(ctx.get(), cur)) {
2453
- model.tensors_by_name.emplace_back(lm_ggml_get_name(cur), cur);
2454
- }
2455
- }
+ #include "ggml.h"
+ #include "ggml-alloc.h"
+ #include "ggml-backend.h"
+ #include "ggml-cpp.h"

- // load tensor data
- for (auto & it : ctx_bufs) {
- lm_ggml_context * ctx = it.first;
- auto & bufs = it.second;
- if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
- return false;
- }
- }
+ #include <algorithm>
+ #include <array>
+ #include <cassert>
+ #include <cfloat>
+ #include <cmath>
+ #include <cstddef>
+ #include <cstdint>
+ #include <cstdio>
+ #include <cstring>
+ #include <ctime>
+ #include <functional>

- if (use_mmap_buffer) {
- for (auto & mapping : ml.mappings) {
- model.mappings.emplace_back(std::move(mapping));
- }
- }
+ #if defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
+ #endif

- return true;
- }
+ #if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING)
+ #include <android/log.h>
+ #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
+ #undef LLAMA_LOG_INFO
+ #undef LLAMA_LOG_WARN
+ #undef LLAMA_LOG_ERROR
+ #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
+ #define LLAMA_LOG_WARN(...) __android_log_print(ANDROID_LOG_WARN , LLAMA_ANDROID_TAG, __VA_ARGS__)
+ #define LLAMA_LOG_ERROR(...) __android_log_print(ANDROID_LOG_ERROR, LLAMA_ANDROID_TAG, __VA_ARGS__)
+ #endif // __ANDROID__
 
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
- static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
- model.t_start_us = lm_ggml_time_us();
+ static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+ // loading time will be recalculated after the first eval, so
+ // we take page faults deferred by mmap() into consideration
+ model.t_load_us = 0;
+ time_meas tm(model.t_load_us);
+
+ model.t_start_us = tm.t_start_us;

  try {
- llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
+ llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
+
+ ml.print_info();

  model.hparams.vocab_only = params.vocab_only;

  try {
- llm_load_arch(ml, model);
+ model.load_arch(ml);
  } catch(const std::exception & e) {
  throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
  }
  try {
- llm_load_hparams(ml, model);
+ model.load_hparams(ml);
  } catch(const std::exception & e) {
  throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
  }
  try {
- llm_load_vocab(ml, model);
+ model.load_vocab(ml);
  } catch(const std::exception & e) {
  throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
  }

- llm_load_stats(ml, model);
- llm_load_print_meta(ml, model);
-
- if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
- model.hparams.n_vocab != model.vocab.id_to_token.size()) {
- throw std::runtime_error("vocab size mismatch");
- }
+ model.load_stats(ml);
+ model.print_info();

  if (params.vocab_only) {
  LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
  return 0;
  }

- if (!llm_load_tensors(
- ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
- params.progress_callback, params.progress_callback_user_data
- )) {
+ if (!model.load_tensors(ml)) {
  return -2;
  }
  } catch (const std::exception & err) {
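Editor's note on the hunk above (not part of the diff): the loader entry point now takes the list of GGUF split paths and delegates to llama_model methods (load_arch, load_hparams, load_vocab, load_stats, load_tensors), while load timing moves from manual lm_ggml_time_us() bookkeeping to the time_meas RAII helper. A rough sketch of how such a helper can work; this is illustrative only and not the actual definition, which lives elsewhere in the package:

    struct time_meas {
        // start the clock and remember where to accumulate the elapsed time
        explicit time_meas(int64_t & t_acc) : t_start_us(lm_ggml_time_us()), t_acc(t_acc) {}
        // accumulate elapsed microseconds when the enclosing scope exits
        ~time_meas() { t_acc += lm_ggml_time_us() - t_start_us; }
        const int64_t t_start_us;
        int64_t     & t_acc;
    };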
@@ -2521,10 +89,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
  return -1;
  }

- // loading time will be recalculate after the first eval, so
- // we take page faults deferred by mmap() into consideration
- model.t_load_us = lm_ggml_time_us() - model.t_start_us;
-
  return 0;
  }

@@ -2572,16 +136,16 @@ static struct lm_ggml_tensor * llm_build_inp_embd(
  inpL = lm_ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);

  // apply lora for embedding tokens if needed
- for (auto & it : lctx.lora_adapters) {
- struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
- if (lora == nullptr) {
+ for (auto & it : lctx.lora) {
+ struct llama_adapter_lora_weight * lw = it.first->get_weight(tok_embd);
+ if (lw == nullptr) {
  continue;
  }
  const float adapter_scale = it.second;
- const float scale = lora->get_scale(it.first->alpha, adapter_scale);
+ const float scale = lw->get_scale(it.first->alpha, adapter_scale);
  struct lm_ggml_tensor * inpL_delta = lm_ggml_scale(ctx, lm_ggml_mul_mat(
- ctx, lora->b, // non-transposed lora_b
- lm_ggml_get_rows(ctx, lora->a, lctx.inp_tokens)
+ ctx, lw->b, // non-transposed lora_b
+ lm_ggml_get_rows(ctx, lw->a, lctx.inp_tokens)
  ), scale);
  inpL = lm_ggml_add(ctx, inpL, inpL_delta);
  }
@@ -2652,16 +216,16 @@ static struct lm_ggml_tensor * llm_build_lora_mm(
  struct lm_ggml_tensor * w,
  struct lm_ggml_tensor * cur) {
  struct lm_ggml_tensor * res = lm_ggml_mul_mat(ctx0, w, cur);
- for (auto & it : lctx.lora_adapters) {
- struct llama_lora_weight * lora = it.first->get_weight(w);
- if (lora == nullptr) {
+ for (auto & it : lctx.lora) {
+ struct llama_adapter_lora_weight * lw = it.first->get_weight(w);
+ if (lw == nullptr) {
  continue;
  }
  const float adapter_scale = it.second;
- const float scale = lora->get_scale(it.first->alpha, adapter_scale);
+ const float scale = lw->get_scale(it.first->alpha, adapter_scale);
  struct lm_ggml_tensor * ab_cur = lm_ggml_mul_mat(
- ctx0, lora->b,
- lm_ggml_mul_mat(ctx0, lora->a, cur)
+ ctx0, lw->b,
+ lm_ggml_mul_mat(ctx0, lw->a, cur)
  );
  ab_cur = lm_ggml_scale(ctx0, ab_cur, scale);
  res = lm_ggml_add(ctx0, res, ab_cur);
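Editor's note (not part of the diff): the LoRA path kept by this refactor is the standard low-rank update applied on top of the base matmul; only the container (lctx.lora) and the weight type (llama_adapter_lora_weight) are renamed. A minimal sketch of the computation, using illustrative stand-in tensor names W, A, B, x rather than identifiers from the diff:

    // y = W·x + scale · B·(A·x), with scale = adapter_scale * alpha / rank (or adapter_scale if alpha == 0)
    struct lm_ggml_tensor * y  = lm_ggml_mul_mat(ctx0, W, x);                               // base projection
    struct lm_ggml_tensor * ab = lm_ggml_mul_mat(ctx0, B, lm_ggml_mul_mat(ctx0, A, x));     // low-rank delta
    y = lm_ggml_add(ctx0, y, lm_ggml_scale(ctx0, ab, scale));                               // add the scaled delta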
@@ -2677,17 +241,17 @@ static struct lm_ggml_tensor * llm_build_lora_mm_id(
  struct lm_ggml_tensor * cur, // struct lm_ggml_tensor * b
  struct lm_ggml_tensor * ids) {
  struct lm_ggml_tensor * res = lm_ggml_mul_mat_id(ctx0, w, cur, ids);
- for (auto & it : lctx.lora_adapters) {
- struct llama_lora_weight * lora = it.first->get_weight(w);
- if (lora == nullptr) {
+ for (auto & it : lctx.lora) {
+ struct llama_adapter_lora_weight * lw = it.first->get_weight(w);
+ if (lw == nullptr) {
  continue;
  }
  const float alpha = it.first->alpha;
- const float rank = (float) lora->b->ne[0];
+ const float rank = (float) lw->b->ne[0];
  const float scale = alpha ? it.second * alpha / rank : it.second;
  struct lm_ggml_tensor * ab_cur = lm_ggml_mul_mat_id(
- ctx0, lora->b,
- lm_ggml_mul_mat_id(ctx0, lora->a, cur, ids),
+ ctx0, lw->b,
+ lm_ggml_mul_mat_id(ctx0, lw->a, cur, ids),
  ids
  );
  ab_cur = lm_ggml_scale(ctx0, ab_cur, scale);
@@ -3318,16 +882,20 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
  const struct llama_layer * layer,
  struct lm_ggml_tensor * cur,
  struct lm_ggml_tensor * x_prev,
- struct lm_ggml_tensor ** wkv_state) {
+ struct lm_ggml_tensor ** wkv_state,
+ size_t wkv_head_size,
+ size_t head_count_kv) {
  size_t n_embd = cur->ne[0];
  size_t n_seq_tokens = cur->ne[1];
  size_t n_seqs = cur->ne[2];

- size_t head_size = layer->time_mix_first->ne[0];
- size_t head_count = layer->time_mix_first->ne[1];
+ size_t head_size = wkv_head_size;
+ size_t head_count = n_embd / head_size;

  size_t n_tokens = n_seqs * n_seq_tokens;

+ bool is_qrwkv = layer->time_mix_first == nullptr;
+
  struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);

  sx = lm_ggml_reshape_2d(ctx, sx, n_embd, n_tokens);
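Editor's note (not part of the diff): the signature change above passes the WKV head size and the KV head count in explicitly, and head_count is now derived from the embedding width rather than from time_mix_first, which is absent for QRWKV models. Restated as a standalone sketch; it assumes n_embd is an exact multiple of wkv_head_size, as the code above does implicitly:

    const size_t head_size  = wkv_head_size;                    // provided by the caller
    const size_t head_count = n_embd / head_size;               // no longer read from time_mix_first->ne[1]
    const bool   is_qrwkv   = layer->time_mix_first == nullptr; // QRWKV checkpoints ship without the "first" tensor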
@@ -3356,69 +924,64 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
3356
924
  xxx
3357
925
  );
3358
926
 
3359
- struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
3360
- struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
3361
- struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
3362
- struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
3363
- struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
3364
-
3365
- struct lm_ggml_tensor * xw = lm_ggml_add(
3366
- ctx,
3367
- lm_ggml_mul(
3368
- ctx,
3369
- lm_ggml_add(ctx, mw, layer->time_mix_lerp_w),
3370
- sx
3371
- ),
3372
- cur
3373
- );
927
+ struct lm_ggml_tensor *xw, *xk, *xv, *xr, *xg;
928
+ if (layer->time_mix_lerp_fused) {
929
+ // fusing these weights makes some performance improvement
930
+ sx = lm_ggml_reshape_3d(ctx, sx, n_embd, 1, n_tokens);
931
+ cur = lm_ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
932
+ xxx = lm_ggml_add(ctx, lm_ggml_mul(ctx, lm_ggml_add(ctx, xxx, layer->time_mix_lerp_fused), sx), cur);
933
+ xw = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
934
+ xk = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
935
+ xv = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
936
+ xr = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
937
+ xg = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
938
+ } else {
939
+ // for backward compatibility
940
+ xw = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
941
+ xk = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
942
+ xv = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
943
+ xr = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
944
+ xg = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
3374
945
 
3375
- struct lm_ggml_tensor * xk = lm_ggml_add(
3376
- ctx,
3377
- lm_ggml_mul(
3378
- ctx,
3379
- lm_ggml_add(ctx, mk, layer->time_mix_lerp_k),
3380
- sx
3381
- ),
3382
- cur
3383
- );
946
+ xw = lm_ggml_add(ctx, lm_ggml_mul(ctx, lm_ggml_add(ctx, xw, layer->time_mix_lerp_w), sx), cur);
947
+ xk = lm_ggml_add(ctx, lm_ggml_mul(ctx, lm_ggml_add(ctx, xk, layer->time_mix_lerp_k), sx), cur);
948
+ xv = lm_ggml_add(ctx, lm_ggml_mul(ctx, lm_ggml_add(ctx, xv, layer->time_mix_lerp_v), sx), cur);
949
+ xr = lm_ggml_add(ctx, lm_ggml_mul(ctx, lm_ggml_add(ctx, xr, layer->time_mix_lerp_r), sx), cur);
950
+ xg = lm_ggml_add(ctx, lm_ggml_mul(ctx, lm_ggml_add(ctx, xg, layer->time_mix_lerp_g), sx), cur);
951
+ }
3384
952
 
3385
- struct lm_ggml_tensor * xv = lm_ggml_add(
3386
- ctx,
3387
- lm_ggml_mul(
3388
- ctx,
3389
- lm_ggml_add(ctx, mv, layer->time_mix_lerp_v),
3390
- sx
3391
- ),
3392
- cur
3393
- );
953
+ struct lm_ggml_tensor * r = llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr);
954
+ struct lm_ggml_tensor * k = llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk);
955
+ struct lm_ggml_tensor * v = llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv);
956
+ if (layer->time_mix_receptance_b) {
957
+ r = lm_ggml_add(ctx, r, layer->time_mix_receptance_b);
958
+ }
959
+ if (layer->time_mix_key_b) {
960
+ k = lm_ggml_add(ctx, k, layer->time_mix_key_b);
961
+ }
962
+ if (layer->time_mix_value_b) {
963
+ v = lm_ggml_add(ctx, v, layer->time_mix_value_b);
964
+ }
3394
965
 
3395
- struct lm_ggml_tensor * xr = lm_ggml_add(
3396
- ctx,
3397
- lm_ggml_mul(
3398
- ctx,
3399
- lm_ggml_add(ctx, mr, layer->time_mix_lerp_r),
3400
- sx
3401
- ),
3402
- cur
3403
- );
966
+ struct lm_ggml_tensor * g = llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg);
967
+ if (is_qrwkv) {
968
+ g = lm_ggml_sigmoid(ctx, g);
969
+ } else {
970
+ g = lm_ggml_silu(ctx, g);
971
+ }
3404
972
 
3405
- struct lm_ggml_tensor * xg = lm_ggml_add(
3406
- ctx,
3407
- lm_ggml_mul(
3408
- ctx,
3409
- lm_ggml_add(ctx, mg, layer->time_mix_lerp_g),
3410
- sx
3411
- ),
3412
- cur
3413
- );
973
+ if (head_count_kv != head_count) {
974
+ LM_GGML_ASSERT(head_count % head_count_kv == 0);
975
+ k = lm_ggml_reshape_4d(ctx, k, head_size, 1, head_count_kv, n_tokens);
976
+ v = lm_ggml_reshape_4d(ctx, v, head_size, 1, head_count_kv, n_tokens);
977
+ struct lm_ggml_tensor * tmp = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens);
978
+ k = lm_ggml_repeat(ctx, k, tmp);
979
+ v = lm_ggml_repeat(ctx, v, tmp);
980
+ }
3414
981
 
3415
- struct lm_ggml_tensor * r = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
3416
- struct lm_ggml_tensor * k = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
3417
- struct lm_ggml_tensor * v = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
3418
- struct lm_ggml_tensor * g = lm_ggml_silu(
3419
- ctx,
3420
- llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
3421
- );
982
+ k = lm_ggml_reshape_3d(ctx, k, head_size, head_count, n_tokens);
983
+ v = lm_ggml_reshape_3d(ctx, v, head_size, head_count, n_tokens);
984
+ r = lm_ggml_reshape_3d(ctx, r, head_size, head_count, n_tokens);
3422
985
 
3423
986
  struct lm_ggml_tensor * w = lm_ggml_mul_mat(
3424
987
  ctx,
@@ -3429,25 +992,35 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
3429
992
  )
3430
993
  );
3431
994
 
3432
- w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd));
995
+ w = lm_ggml_add(ctx, w, layer->time_mix_decay);
3433
996
  w = lm_ggml_exp(ctx, lm_ggml_neg(ctx, lm_ggml_exp(ctx, w)));
3434
- w = lm_ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
997
+ w = lm_ggml_reshape_3d(ctx, w, head_size, head_count, n_tokens);
3435
998
 
3436
- k = lm_ggml_transpose(ctx, k);
3437
- v = lm_ggml_transpose(ctx, v);
3438
- r = lm_ggml_transpose(ctx, r);
999
+ if (is_qrwkv) {
1000
+ // k = k * (1 - w)
1001
+ k = lm_ggml_sub(ctx, k, lm_ggml_mul(ctx, k, w));
1002
+ }
3439
1003
 
3440
- struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
1004
+ struct lm_ggml_tensor * wkv_output;
1005
+ if (!layer->time_mix_first) {
1006
+ wkv_output = lm_ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f));
1007
+ } else {
1008
+ wkv_output = lm_ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
1009
+ }
3441
1010
  cur = lm_ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
3442
1011
  *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
3443
1012
 
3444
- // group norm with head_count groups
3445
- cur = lm_ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
3446
- cur = lm_ggml_norm(ctx, cur, 64e-5f);
1013
+ if (!is_qrwkv) {
1014
+ // group norm with head_count groups
1015
+ cur = lm_ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
1016
+ cur = lm_ggml_norm(ctx, cur, 64e-5f);
3447
1017
 
3448
- // Convert back to regular vectors.
3449
- cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
3450
- cur = lm_ggml_add(ctx, lm_ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
1018
+ // Convert back to regular vectors.
1019
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
1020
+ cur = lm_ggml_add(ctx, lm_ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
1021
+ } else {
1022
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
1023
+ }
3451
1024
 
3452
1025
  cur = lm_ggml_mul(ctx, cur, g);
3453
1026
  cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
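Editor's note (not part of the diff): condensing the branchy hunk above, when time_mix_first is missing (the QRWKV case) the gate uses a sigmoid instead of SiLU, k is pre-scaled by (1 - w), the state update goes through lm_ggml_gated_linear_attn, and the per-head group norm is skipped. A condensed sketch of that control flow, using the same variable names as the diff:

    g = is_qrwkv ? lm_ggml_sigmoid(ctx, g) : lm_ggml_silu(ctx, g);
    if (is_qrwkv) {
        k = lm_ggml_sub(ctx, k, lm_ggml_mul(ctx, k, w));  // k *= (1 - w)
        wkv_output = lm_ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f));
    } else {
        wkv_output = lm_ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
    }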
@@ -3603,7 +1176,7 @@ struct llm_build_context {
3603
1176
  }
3604
1177
 
3605
1178
  struct lm_ggml_cgraph * build_k_shift() {
3606
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1179
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
3607
1180
 
3608
1181
  LM_GGML_ASSERT(kv_self.size == n_ctx);
3609
1182
 
@@ -3653,7 +1226,7 @@ struct llm_build_context {
3653
1226
  }
3654
1227
 
3655
1228
  struct lm_ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
3656
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1229
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
3657
1230
 
3658
1231
  for (uint32_t i = 0; i < ids.size(); ++i) {
3659
1232
  const uint32_t id = ids[i];
@@ -3912,7 +1485,7 @@ struct llm_build_context {
3912
1485
  }
3913
1486
 
3914
1487
  struct lm_ggml_cgraph * build_llama() {
3915
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1488
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
3916
1489
 
3917
1490
  // mutable variable, needed during the last layer of the computation to skip unused tokens
3918
1491
  int32_t n_tokens = this->n_tokens;
@@ -4078,7 +1651,7 @@ struct llm_build_context {
4078
1651
  }
4079
1652
 
4080
1653
  struct lm_ggml_cgraph * build_deci() {
4081
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1654
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
4082
1655
 
4083
1656
  // mutable variable, needed during the last layer of the computation to skip unused tokens
4084
1657
  int32_t n_tokens = this->n_tokens;
@@ -4239,7 +1812,7 @@ struct llm_build_context {
4239
1812
  }
4240
1813
 
4241
1814
  struct lm_ggml_cgraph * build_baichuan() {
4242
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1815
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
4243
1816
 
4244
1817
  const int64_t n_embd_head = hparams.n_embd_head_v;
4245
1818
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -4251,7 +1824,7 @@ struct llm_build_context {
4251
1824
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
4252
1825
 
4253
1826
  // inp_pos - contains the positions
4254
- struct lm_ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
1827
+ struct lm_ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
4255
1828
 
4256
1829
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4257
1830
  struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
@@ -4276,7 +1849,7 @@ struct llm_build_context {
4276
1849
  cb(Vcur, "Vcur", il);
4277
1850
 
4278
1851
  switch (model.type) {
4279
- case MODEL_7B:
1852
+ case LLM_TYPE_7B:
4280
1853
  Qcur = lm_ggml_rope_ext(
4281
1854
  ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
4282
1855
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -4288,7 +1861,7 @@ struct llm_build_context {
4288
1861
  ext_factor, attn_factor, beta_fast, beta_slow
4289
1862
  );
4290
1863
  break;
4291
- case MODEL_13B:
1864
+ case LLM_TYPE_13B:
4292
1865
  Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
4293
1866
  Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
4294
1867
  break;
@@ -4354,7 +1927,7 @@ struct llm_build_context {
4354
1927
  }
4355
1928
 
4356
1929
  struct lm_ggml_cgraph * build_xverse() {
4357
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
1930
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
4358
1931
 
4359
1932
  const int64_t n_embd_head = hparams.n_embd_head_v;
4360
1933
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -4457,7 +2030,7 @@ struct llm_build_context {
4457
2030
  }
4458
2031
 
4459
2032
  struct lm_ggml_cgraph * build_falcon() {
4460
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
2033
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
4461
2034
 
4462
2035
  const int64_t n_embd_head = hparams.n_embd_head_v;
4463
2036
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -4577,7 +2150,7 @@ struct llm_build_context {
4577
2150
  }
4578
2151
 
4579
2152
  struct lm_ggml_cgraph * build_grok() {
4580
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
2153
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
4581
2154
 
4582
2155
  // mutable variable, needed during the last layer of the computation to skip unused tokens
4583
2156
  int32_t n_tokens = this->n_tokens;
@@ -4736,7 +2309,7 @@ struct llm_build_context {
4736
2309
  }
4737
2310
 
4738
2311
  struct lm_ggml_cgraph * build_dbrx() {
4739
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
2312
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
4740
2313
 
4741
2314
  // mutable variable, needed during the last layer of the computation to skip unused tokens
4742
2315
  int32_t n_tokens = this->n_tokens;
@@ -4864,7 +2437,7 @@ struct llm_build_context {
4864
2437
  }
4865
2438
 
4866
2439
  struct lm_ggml_cgraph * build_starcoder() {
4867
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
2440
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
4868
2441
 
4869
2442
  const int64_t n_embd_head = hparams.n_embd_head_v;
4870
2443
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -4968,7 +2541,7 @@ struct llm_build_context {
4968
2541
  }
4969
2542
 
4970
2543
  struct lm_ggml_cgraph * build_refact() {
4971
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
2544
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
4972
2545
 
4973
2546
  const int64_t n_embd_head = hparams.n_embd_head_v;
4974
2547
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -5062,7 +2635,7 @@ struct llm_build_context {
5062
2635
  }
5063
2636
 
5064
2637
  struct lm_ggml_cgraph * build_bert() {
5065
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
2638
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
5066
2639
 
5067
2640
  const int64_t n_embd_head = hparams.n_embd_head_v;
5068
2641
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -5256,7 +2829,7 @@ struct llm_build_context {
5256
2829
  }
5257
2830
 
5258
2831
  struct lm_ggml_cgraph * build_bloom() {
5259
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
2832
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
5260
2833
 
5261
2834
  const int64_t n_embd_head = hparams.n_embd_head_v;
5262
2835
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -5357,7 +2930,7 @@ struct llm_build_context {
5357
2930
  }
5358
2931
 
5359
2932
  struct lm_ggml_cgraph * build_mpt() {
5360
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
2933
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
5361
2934
 
5362
2935
  const int64_t n_embd_head = hparams.n_embd_head_v;
5363
2936
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -5647,7 +3220,7 @@ struct llm_build_context {
5647
3220
  }
5648
3221
 
5649
3222
  struct lm_ggml_cgraph * build_qwen() {
5650
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
3223
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
5651
3224
 
5652
3225
  const int64_t n_embd_head = hparams.n_embd_head_v;
5653
3226
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -5759,7 +3332,7 @@ struct llm_build_context {
5759
3332
  }
5760
3333
 
5761
3334
  struct lm_ggml_cgraph * build_qwen2() {
5762
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
3335
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
5763
3336
 
5764
3337
  const int64_t n_embd_head = hparams.n_embd_head_v;
5765
3338
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -5871,7 +3444,7 @@ struct llm_build_context {
5871
3444
  }
5872
3445
 
5873
3446
  struct lm_ggml_cgraph * build_qwen2vl() {
5874
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
3447
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
5875
3448
  const int64_t n_embd_head = hparams.n_embd_head_v;
5876
3449
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5877
3450
  LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5989,7 +3562,7 @@ struct llm_build_context {
5989
3562
  }
5990
3563
 
5991
3564
  struct lm_ggml_cgraph * build_qwen2moe() {
5992
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
3565
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
5993
3566
 
5994
3567
  // mutable variable, needed during the last layer of the computation to skip unused tokens
5995
3568
  int32_t n_tokens = this->n_tokens;
@@ -6137,7 +3710,7 @@ struct llm_build_context {
6137
3710
  }
6138
3711
 
6139
3712
  struct lm_ggml_cgraph * build_phi2() {
6140
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
3713
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
6141
3714
 
6142
3715
  const int64_t n_embd_head = hparams.n_embd_head_v;
6143
3716
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -6258,7 +3831,7 @@ struct llm_build_context {
6258
3831
  }
6259
3832
 
6260
3833
  struct lm_ggml_cgraph * build_phi3() {
6261
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
3834
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
6262
3835
 
6263
3836
  const int64_t n_embd_head = hparams.n_embd_head_v;
6264
3837
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -6291,7 +3864,7 @@ struct llm_build_context {
6291
3864
 
6292
3865
  struct lm_ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
6293
3866
  model.layers[il].attn_norm,
6294
- NULL,
3867
+ model.layers[il].attn_norm_b,
6295
3868
  LLM_NORM_RMS, cb, il);
6296
3869
  cb(attn_norm_output, "attn_norm", il);
6297
3870
 
@@ -6306,8 +3879,7 @@ struct llm_build_context {
6306
3879
  Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
6307
3880
  Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
6308
3881
  Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
6309
- }
6310
- else {
3882
+ } else {
6311
3883
  Qcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
6312
3884
  Kcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
6313
3885
  Vcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
@@ -6351,14 +3923,12 @@ struct llm_build_context {
6351
3923
  residual = cur;
6352
3924
 
6353
3925
  cur = llm_build_norm(ctx0, cur, hparams,
6354
- model.layers[il].ffn_norm, NULL,
3926
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
6355
3927
  LLM_NORM_RMS, cb, il);
6356
3928
  cb(cur, "ffn_norm", il);
6357
3929
 
6358
- // FF
6359
- // special-case: the up and gate tensors are merged into a single tensor
6360
- // TOOD: support into llm_build_ffn
6361
- {
3930
+ // feed-forward network
3931
+ if (model.layers[il].ffn_gate_inp == nullptr) {
6362
3932
  cur = llm_build_ffn(ctx0, lctx, cur,
6363
3933
  model.layers[il].ffn_up, NULL, NULL,
6364
3934
  NULL, NULL, NULL,
@@ -6366,6 +3936,20 @@ struct llm_build_context {
6366
3936
  NULL,
6367
3937
  LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
6368
3938
  cb(cur, "ffn_out", il);
3939
+ } else {
3940
+ // MoE branch
3941
+ cur = llm_build_moe_ffn(ctx0, lctx, cur,
3942
+ model.layers[il].ffn_gate_inp,
3943
+ model.layers[il].ffn_up_exps,
3944
+ model.layers[il].ffn_gate_exps,
3945
+ model.layers[il].ffn_down_exps,
3946
+ nullptr,
3947
+ n_expert, n_expert_used,
3948
+ LLM_FFN_SILU, true,
3949
+ false, 0.0,
3950
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
3951
+ cb, il);
3952
+ cb(cur, "ffn_moe_out", il);
6369
3953
  }
6370
3954
 
6371
3955
  cur = lm_ggml_add(ctx0, residual, cur);
@@ -6378,11 +3962,16 @@ struct llm_build_context {
6378
3962
 
6379
3963
  cur = llm_build_norm(ctx0, inpL, hparams,
6380
3964
  model.output_norm,
6381
- NULL,
3965
+ model.output_norm_b,
6382
3966
  LLM_NORM_RMS, cb, -1);
6383
3967
  cb(cur, "result_norm", -1);
6384
3968
 
6385
3969
  cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
3970
+
3971
+ if (model.output_b != nullptr) {
3972
+ cb(cur, "result_output_no_bias", -1);
3973
+ cur = lm_ggml_add(ctx0, cur, model.output_b);
3974
+ }
6386
3975
  cb(cur, "result_output", -1);
6387
3976
 
6388
3977
  lm_ggml_build_forward_expand(gf, cur);
@@ -6496,7 +4085,7 @@ struct llm_build_context {
6496
4085
  }
6497
4086
 
6498
4087
  struct lm_ggml_cgraph * build_gpt2() {
6499
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
4088
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
6500
4089
 
6501
4090
  const int64_t n_embd_head = hparams.n_embd_head_v;
6502
4091
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -6601,7 +4190,7 @@ struct llm_build_context {
6601
4190
  }
6602
4191
 
6603
4192
  struct lm_ggml_cgraph * build_codeshell() {
6604
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
4193
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
6605
4194
 
6606
4195
  const int64_t n_embd_head = hparams.n_embd_head_v;
6607
4196
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -6712,7 +4301,7 @@ struct llm_build_context {
6712
4301
  }
6713
4302
 
6714
4303
  struct lm_ggml_cgraph * build_orion() {
6715
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
4304
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
6716
4305
 
6717
4306
  const int64_t n_embd_head = hparams.n_embd_head_v;
6718
4307
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -6830,7 +4419,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_internlm2() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  const int64_t n_embd_head = hparams.n_embd_head_v;
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -6948,7 +4537,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_minicpm3() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  //TODO: if the model varies, these parameters need to be read from the model
  const int64_t n_embd_base = 256;
@@ -7064,7 +4653,7 @@ struct llm_build_context {
  0);
  cb(v_states, "v_states", il);
 
- q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+ q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
  q_pe = lm_ggml_rope_ext(
  ctx0, q_pe, inp_pos, rope_factors,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -7073,7 +4662,7 @@ struct llm_build_context {
  cb(q_pe, "q_pe", il);
 
  // shared RoPE key
- k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+ k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
  k_pe = lm_ggml_rope_ext(
  ctx0, k_pe, inp_pos, rope_factors,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -7157,7 +4746,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_gemma() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
 
@@ -7265,7 +4854,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_gemma2() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
 
@@ -7315,9 +4904,9 @@ struct llm_build_context {
 
  // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
  switch (model.type) {
- case llm_type::MODEL_2B:
- case llm_type::MODEL_9B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
- case llm_type::MODEL_27B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
+ case LLM_TYPE_2B:
+ case LLM_TYPE_9B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
+ case LLM_TYPE_27B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
  default: LM_GGML_ABORT("fatal error");
  };
  cb(Qcur, "Qcur_scaled", il);
@@ -7401,7 +4990,7 @@ struct llm_build_context {
 
 
  struct lm_ggml_cgraph * build_starcoder2() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  const int64_t n_embd_head = hparams.n_embd_head_v;
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7520,7 +5109,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_mamba() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  struct lm_ggml_tensor * cur;
  struct lm_ggml_tensor * inpL;
@@ -7575,7 +5164,7 @@ struct llm_build_context {
 
  struct lm_ggml_cgraph * build_command_r() {
 
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  const int64_t n_embd_head = hparams.n_embd_head_v;
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7723,7 +5312,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_cohere2() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  const int64_t n_embd_head = hparams.n_embd_head_v;
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7860,7 +5449,7 @@ struct llm_build_context {
  // * removed bias
  // * removed MoE
  struct lm_ggml_cgraph * build_olmo() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  // mutable variable, needed during the last layer of the computation to skip unused tokens
  int32_t n_tokens = this->n_tokens;
@@ -7984,7 +5573,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_olmo2() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  // mutable variable, needed during the last layer of the computation to skip unused tokens
  int32_t n_tokens = this->n_tokens;
@@ -8112,7 +5701,7 @@ struct llm_build_context {
  // * removed bias
  // * added q, k norm
  struct lm_ggml_cgraph * build_olmoe() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  // mutable variable, needed during the last layer of the computation to skip unused tokens
  int32_t n_tokens = this->n_tokens;
@@ -8238,7 +5827,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_openelm() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  const int64_t n_embd_head = hparams.n_embd_head_v;
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8363,7 +5952,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_gptneox() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  const int64_t n_embd_head = hparams.n_embd_head_v;
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -8505,7 +6094,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_arctic() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  // mutable variable, needed during the last layer of the computation to skip unused tokens
  int32_t n_tokens = this->n_tokens;
@@ -8639,7 +6228,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_deepseek() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  // mutable variable, needed during the last layer of the computation to skip unused tokens
  int32_t n_tokens = this->n_tokens;
@@ -8796,7 +6385,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_deepseek2() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  // mutable variable, needed during the last layer of the computation to skip unused tokens
  int32_t n_tokens = this->n_tokens;
@@ -8918,7 +6507,7 @@ struct llm_build_context {
  0);
  cb(v_states, "v_states", il);
 
- q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+ q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
  q_pe = lm_ggml_rope_ext(
  ctx0, q_pe, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -8927,7 +6516,7 @@ struct llm_build_context {
  cb(q_pe, "q_pe", il);
 
  // shared RoPE key
- k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+ k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
  k_pe = lm_ggml_rope_ext(
  ctx0, k_pe, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9026,7 +6615,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_bitnet() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  const int64_t n_embd_head = hparams.n_embd_head_v;
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9177,7 +6766,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_t5_enc() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  // mutable variable, needed during the last layer of the computation to skip unused tokens
  int32_t n_tokens = this->n_tokens;
@@ -9309,7 +6898,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_t5_dec() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  // mutable variable, needed during the last layer of the computation to skip unused tokens
  int32_t n_tokens = this->n_tokens;
@@ -9514,7 +7103,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_jais() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  const int64_t n_embd_head = hparams.n_embd_head_v;
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9606,7 +7195,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_chatglm() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  const int64_t n_embd_head = hparams.n_embd_head_v;
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9720,7 +7309,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_nemotron() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  const int64_t n_embd_head = hparams.n_embd_head_v;
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9841,7 +7430,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_exaone() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  // mutable variable, needed during the last layer of the computation to skip unused tokens
  int32_t n_tokens = this->n_tokens;
@@ -9968,7 +7557,7 @@ struct llm_build_context {
  }
 
  lm_ggml_cgraph * build_rwkv6() {
- lm_ggml_cgraph *gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  // Token shift state dimensions should be 2 * n_emb
  LM_GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
@@ -10013,7 +7602,7 @@ struct llm_build_context {
  1
  );
 
- cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
+ cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size));
  lm_ggml_build_forward_expand(gf, cur);
  lm_ggml_build_forward_expand(
  gf,
@@ -10080,6 +7669,118 @@ struct llm_build_context {
  return gf;
  }
 
+ // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
+ lm_ggml_cgraph * build_rwkv6qwen2() {
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ LM_GGML_ASSERT(n_embd == hparams.n_embd_k_s());
+
+ const int64_t n_seqs = ubatch.n_seqs;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+ const int64_t n_tokens = ubatch.n_tokens;
+ LM_GGML_ASSERT(n_seqs != 0);
+ LM_GGML_ASSERT(ubatch.equal_seqs);
+ LM_GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
+
+ struct lm_ggml_tensor * cur;
+ struct lm_ggml_tensor * inpL;
+ struct lm_ggml_tensor * state_copy = build_inp_s_copy();
+ struct lm_ggml_tensor * state_mask = build_inp_s_mask();
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+ for (int il = 0; il < n_layer; ++il) {
+ const llama_layer * layer = &model.layers[il];
+
+ // (ab)using the KV cache to store the states
+ struct lm_ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
+ gf, kv_self.k_l[il], state_copy, state_mask,
+ hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
+ struct lm_ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
+ gf, kv_self.v_l[il], state_copy, state_mask,
+ hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
+
+ cur = lm_ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+ token_shift = lm_ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs);
+
+ struct lm_ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il);
+ struct lm_ggml_tensor * x_prev = lm_ggml_concat(
+ ctx0,
+ token_shift,
+ lm_ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
+ 1
+ );
+
+ lm_ggml_build_forward_expand(
+ gf,
+ lm_ggml_cpy(
+ ctx0,
+ wkv_states,
+ lm_ggml_view_1d(
+ ctx0,
+ kv_self.v_l[il],
+ hparams.n_embd_v_s() * n_seqs,
+ hparams.n_embd_v_s() * kv_head * lm_ggml_element_size(kv_self.v_l[il])
+ )
+ )
+ );
+
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv()));
+ lm_ggml_build_forward_expand(gf, ffn_inp);
+ lm_ggml_build_forward_expand(
+ gf,
+ lm_ggml_cpy(
+ ctx0,
+ wkv_states,
+ lm_ggml_view_1d(
+ ctx0,
+ kv_self.v_l[il],
+ hparams.n_embd_v_s() * n_seqs,
+ hparams.n_embd_v_s() * kv_head * lm_ggml_element_size(kv_self.v_l[il])
+ )
+ )
+ );
+
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = lm_ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ lm_ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
  // ref: https://github.com/facebookresearch/chameleon
  // based on the original build_llama() function, changes:
  // * qk-norm
@@ -10087,7 +7788,7 @@ struct llm_build_context {
  // * removed bias
  // * removed MoE
  struct lm_ggml_cgraph * build_chameleon() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  // mutable variable, needed during the last layer of the computation to skip unused tokens
  int32_t n_tokens = this->n_tokens;
@@ -10259,7 +7960,7 @@ struct llm_build_context {
  }
 
  struct lm_ggml_cgraph * build_wavtokenizer_dec() {
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
  struct lm_ggml_tensor * cur;
  struct lm_ggml_tensor * inpL;
@@ -10468,12 +8169,12 @@ static struct lm_ggml_cgraph * llama_build_graph(
 
  // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
  // FIXME: fix in lm_ggml_backend_sched
- const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+ const bool full_offload = lctx.model.params.n_gpu_layers > (int) lctx.model.hparams.n_layer;
  if (ubatch.n_tokens < 32 || full_offload) {
  if (il != -1 && strcmp(name, "norm") == 0) {
- const auto & dev_layer = lctx.model.dev_layer.at(il);
+ const auto & dev_layer = lctx.model.dev_layer(il);
  for (auto & backend : lctx.backends) {
- if (lm_ggml_backend_get_device(backend.get()) == dev_layer.dev) {
+ if (lm_ggml_backend_get_device(backend.get()) == dev_layer) {
  if (lm_ggml_backend_supports_op(backend.get(), cur)) {
  lm_ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get());
  }
@@ -10561,6 +8262,7 @@ static struct lm_ggml_cgraph * llama_build_graph(
  result = llm.build_phi2();
  } break;
  case LLM_ARCH_PHI3:
+ case LLM_ARCH_PHIMOE:
  {
  result = llm.build_phi3();
  } break;
@@ -10688,6 +8390,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
  {
  result = llm.build_rwkv6();
  } break;
+ case LLM_ARCH_RWKV6QWEN2:
+ {
+ result = llm.build_rwkv6qwen2();
+ } break;
  case LLM_ARCH_CHAMELEON:
  {
  result = llm.build_chameleon();
@@ -10767,6 +8473,7 @@ static int llama_decode_impl(
  const uint32_t n_tokens_all = batch.n_tokens;
 
  const auto & model = lctx.model;
+ const auto & vocab = model.vocab;
  const auto & hparams = model.hparams;
  const auto & cparams = lctx.cparams;
 
@@ -10774,7 +8481,7 @@ static int llama_decode_impl(
 
  if (batch.token) {
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
- if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+ if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
  LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
  return -1;
  }
@@ -10794,7 +8501,7 @@ static int llama_decode_impl(
  llama_kv_slot_restorer kv_slot_restorer(kv_self);
 
  const int64_t n_embd = hparams.n_embd;
- const int64_t n_vocab = hparams.n_vocab;
+ const int64_t n_vocab = vocab.n_tokens();
 
  uint32_t n_outputs = 0;
  uint32_t n_outputs_prev = 0;
@@ -11109,7 +8816,7 @@ static int llama_encode_impl(
 
  if (batch.token) {
  for (uint32_t i = 0; i < n_tokens; ++i) {
- if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+ if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
  LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
  return -1;
  }
@@ -11286,9 +8993,9 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
  // each move requires 6*n_layer tensors (see build_defrag)
  // - source view, destination view, copy operation
  // - x2 for keys and values
- //const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer);
+ //const uint32_t max_moves = model.max_nodes()/(6*n_layer);
  // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
- const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer);
+ const uint32_t max_moves = (lctx.model.max_nodes() - 2*n_layer)/(6*n_layer);
 
  // determine which KV cells to move where
  //
@@ -11535,7 +9242,7 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) {
  // build worst-case graph
  uint32_t n_seqs = 1; // TODO: worst-case number of sequences
  uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
- llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+ llama_token token = lctx.model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
  llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
  lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);
 
@@ -11547,39 +9254,38 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) {
  }
  }
 
- int32_t llama_lora_adapter_set(
+ int32_t llama_set_adapter_lora(
  struct llama_context * ctx,
- struct llama_lora_adapter * adapter,
+ struct llama_adapter_lora * adapter,
  float scale) {
- ctx->lora_adapters[adapter] = scale;
+ ctx->lora[adapter] = scale;
  return 0;
  }
 
- int32_t llama_lora_adapter_remove(
+ int32_t llama_rm_adapter_lora(
  struct llama_context * ctx,
- struct llama_lora_adapter * adapter) {
- auto pos = ctx->lora_adapters.find(adapter);
- if (pos != ctx->lora_adapters.end()) {
- ctx->lora_adapters.erase(pos);
+ struct llama_adapter_lora * adapter) {
+ auto pos = ctx->lora.find(adapter);
+ if (pos != ctx->lora.end()) {
+ ctx->lora.erase(pos);
  return 0;
  }
 
  return -1;
  }
 
- void llama_lora_adapter_clear(struct llama_context * ctx) {
- ctx->lora_adapters.clear();
+ void llama_clear_adapter_lora(struct llama_context * ctx) {
+ ctx->lora.clear();
  }
 
- // TODO: tmp
- int32_t llama_control_vector_apply(
- struct llama_context * lctx,
+ int32_t llama_apply_adapter_cvec(
+ struct llama_context * ctx,
  const float * data,
  size_t len,
  int32_t n_embd,
  int32_t il_start,
  int32_t il_end) {
- return llama_control_vector_apply(lctx->cvec, lctx->model, data, len, n_embd, il_start, il_end);
+ return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end);
  }
 
  //
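The hunk above renames the public adapter entry points (`llama_lora_adapter_*` → `llama_*_adapter_lora`, `llama_control_vector_apply` → `llama_apply_adapter_cvec`). A minimal migration sketch follows; `llama_adapter_lora_init()` and the adapter path are assumptions based on the same renaming pattern and are not shown in this diff:

```cpp
#include "llama.h"

// Hedged sketch of the renamed adapter API from the hunk above.
// Assumption: llama_adapter_lora_init() is the renamed loader (old llama_lora_adapter_init());
// "adapter.gguf" is an illustrative path.
static void adapter_migration_example(struct llama_context * ctx, struct llama_model * model) {
    struct llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
    if (adapter != NULL) {
        llama_set_adapter_lora(ctx, adapter, 0.75f); // was llama_lora_adapter_set()
        llama_rm_adapter_lora(ctx, adapter);         // was llama_lora_adapter_remove()
    }
    llama_clear_adapter_lora(ctx);                   // was llama_lora_adapter_clear()
}
```

The control-vector call follows the same pattern: `llama_apply_adapter_cvec(ctx, data, len, n_embd, il_start, il_end)` replaces `llama_control_vector_apply()`.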
@@ -11679,18 +9385,13 @@ int64_t llama_time_us(void) {
  return lm_ggml_time_us();
  }
 
- struct llama_model * llama_load_model_from_file(
- const char * path_model,
- struct llama_model_params params) {
- return llama_model_load_from_file(path_model, params);
- }
-
- struct llama_model * llama_model_load_from_file(
- const char * path_model,
+ static struct llama_model * llama_model_load_from_file_impl(
+ const std::string & path_model,
+ std::vector<std::string> & splits,
  struct llama_model_params params) {
  lm_ggml_time_init();
 
- llama_model * model = new llama_model;
+ llama_model * model = new llama_model(params);
 
  unsigned cur_percentage = 0;
  if (params.progress_callback == NULL) {
@@ -11709,47 +9410,6 @@ struct llama_model * llama_model_load_from_file(
  };
  }
 
- if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
- // split the servers set them into model->rpc_servers
- std::string servers(params.rpc_servers);
- size_t pos = 0;
- while ((pos = servers.find(',')) != std::string::npos) {
- std::string server = servers.substr(0, pos);
- model->rpc_servers.push_back(server);
- servers.erase(0, pos + 1);
- }
- model->rpc_servers.push_back(servers);
- }
-
- // add RPC devices
- if (!model->rpc_servers.empty()) {
- lm_ggml_backend_reg_t rpc_reg = lm_ggml_backend_reg_by_name("RPC");
- if (!rpc_reg) {
- LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
- llama_model_free(model);
- return nullptr;
- }
-
- typedef lm_ggml_backend_dev_t (*lm_ggml_backend_rpc_add_device_t)(const char * endpoint);
- lm_ggml_backend_rpc_add_device_t lm_ggml_backend_rpc_add_device_fn = (lm_ggml_backend_rpc_add_device_t) lm_ggml_backend_reg_get_proc_address(rpc_reg, "lm_ggml_backend_rpc_add_device");
- if (!lm_ggml_backend_rpc_add_device_fn) {
- LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
- llama_model_free(model);
- return nullptr;
- }
-
- for (const std::string & server : model->rpc_servers) {
- lm_ggml_backend_dev_t dev = lm_ggml_backend_rpc_add_device_fn(server.c_str());
- if (dev) {
- model->devices.push_back(dev);
- } else {
- LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
- llama_model_free(model);
- return nullptr;
- }
- }
- }
-
  // create list of devices to use with this model
  if (params.devices) {
  for (lm_ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
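The removed block above drops the loader's built-in RPC server parsing; extra backends are now expected to arrive through `params.devices`, which the retained loop just above iterates. A sketch under the assumption that the `lm_`-prefixed ggml-backend helpers mirror upstream ggml (`lm_ggml_backend_dev_by_type`, `LM_GGML_BACKEND_DEVICE_TYPE_GPU`); error handling is minimal:

```cpp
#include "llama.h"
#include "ggml-backend.h"

// Sketch only: populate llama_model_params::devices (a NULL-terminated list) instead of
// relying on the removed rpc_servers handling. The device lookup helper is assumed to
// exist under this lm_-prefixed name in this build.
static struct llama_model * load_with_explicit_device(const char * path) {
    lm_ggml_backend_dev_t devs[2] = {
        lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU), // may be NULL if no GPU backend
        NULL,                                                         // terminator
    };

    struct llama_model_params mparams = llama_model_default_params();
    if (devs[0] != NULL) {
        mparams.devices = devs;
    }
    return llama_model_load_from_file(path, mparams);
}
```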
@@ -11790,7 +9450,7 @@ struct llama_model * llama_model_load_from_file(
  LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, lm_ggml_backend_dev_name(dev), lm_ggml_backend_dev_description(dev), free/1024/1024);
  }
 
- int status = llama_model_load(path_model, *model, params);
+ const int status = llama_model_load(path_model, splits, *model, params);
  LM_GGML_ASSERT(status <= 0);
  if (status < 0) {
  if (status == -1) {
@@ -11806,7 +9466,36 @@ struct llama_model * llama_model_load_from_file(
  return model;
  }
 
- struct llama_context * llama_new_context_with_model(
+ // deprecated
+ struct llama_model * llama_load_model_from_file(
+ const char * path_model,
+ struct llama_model_params params) {
+ return llama_model_load_from_file(path_model, params);
+ }
+
+ struct llama_model * llama_model_load_from_file(
+ const char * path_model,
+ struct llama_model_params params) {
+ std::vector<std::string> splits = {};
+ return llama_model_load_from_file_impl(path_model, splits, params);
+ }
+
+ struct llama_model * llama_model_load_from_splits(
+ const char ** paths,
+ size_t n_paths,
+ struct llama_model_params params) {
+ std::vector<std::string> splits;
+ if (n_paths == 0) {
+ LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
+ return nullptr;
+ }
+ for (size_t i = 0; i < n_paths; ++i) {
+ splits.push_back(paths[i]);
+ }
+ return llama_model_load_from_file_impl(splits.front(), splits, params);
+ }
+
+ struct llama_context * llama_init_from_model(
  struct llama_model * model,
  struct llama_context_params params) {
 
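The loaders added above separate single-file loading from explicit multi-file (split GGUF) loading, with `llama_load_model_from_file()` kept only as a deprecated wrapper. A brief usage sketch; the file names are placeholders:

```cpp
#include "llama.h"

// Usage sketch for the loaders added above; paths are placeholders.
static struct llama_model * loading_example(void) {
    struct llama_model_params mparams = llama_model_default_params();

    // single file (llama_load_model_from_file() remains only as a deprecated wrapper)
    struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);

    // explicit list of split files; the first split is used as the main path internally
    const char * paths[] = { "model-00001-of-00002.gguf", "model-00002-of-00002.gguf" };
    struct llama_model * split_model = llama_model_load_from_splits(paths, 2, mparams);
    if (split_model != NULL) {
        llama_model_free(split_model);
    }
    return model;
}
```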
@@ -12064,7 +9753,7 @@ struct llama_context * llama_new_context_with_model(
  backend_ptrs.push_back(backend.get());
  }
 
- const size_t max_nodes = llama_model_max_nodes(*model);
+ const size_t max_nodes = model->max_nodes();
 
  // buffer used to store the computation graph and the tensor meta data
  ctx->buf_compute_meta.resize(lm_ggml_tensor_overhead()*max_nodes + lm_ggml_graph_overhead_custom(max_nodes, false));
@@ -12072,9 +9761,9 @@ struct llama_context * llama_new_context_with_model(
  // TODO: move these checks to lm_ggml_backend_sched
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
  bool pipeline_parallel =
- llama_get_device_count(*model) > 1 &&
- model->n_gpu_layers > (int)model->hparams.n_layer &&
- model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+ model->n_devices() > 1 &&
+ model->params.n_gpu_layers > (int)model->hparams.n_layer &&
+ model->params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
  params.offload_kqv;
 
  // pipeline parallelism requires support for async compute and events in all devices
@@ -12105,7 +9794,7 @@ struct llama_context * llama_new_context_with_model(
  // initialize scheduler with the worst-case graph
  uint32_t n_seqs = 1; // TODO: worst-case number of sequences
  uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
- llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+ llama_token token = ctx->model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
 
  llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
  lm_ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
@@ -12157,6 +9846,12 @@ struct llama_context * llama_new_context_with_model(
  return ctx;
  }
 
+ struct llama_context * llama_new_context_with_model(
+ struct llama_model * model,
+ struct llama_context_params params) {
+ return llama_init_from_model(model, params);
+ }
+
  //
  // kv cache
  //
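`llama_new_context_with_model()` is kept above only as a forwarding wrapper; new code can call `llama_init_from_model()` directly. A short sketch, with an illustrative context size:

```cpp
#include "llama.h"

// llama_init_from_model() is the new entry point; the old llama_new_context_with_model()
// now simply forwards to it (see the wrapper above).
static struct llama_context * context_example(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 4096; // illustrative value
    return llama_init_from_model(model, cparams);
}
```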
@@ -12254,166 +9949,18 @@ int32_t llama_decode(
  return ret;
  }
 
- //
- // vocab
- //
-
- // TODO: tmp bridges below until `struct llama_vocab` is exposed through the public API
-
- const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
- return llama_token_get_text_impl(model->vocab, token);
- }
-
- float llama_token_get_score(const struct llama_model * model, llama_token token) {
- return llama_token_get_score_impl(model->vocab, token);
- }
-
- enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
- return llama_token_get_attr_impl(model->vocab, token);
- }
-
- bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
- return llama_token_is_eog_impl(model->vocab, token);
- }
-
- bool llama_token_is_control(const struct llama_model * model, llama_token token) {
- return llama_token_is_control_impl(model->vocab, token);
- }
-
- llama_token llama_token_bos(const struct llama_model * model) {
- return llama_token_bos_impl(model->vocab);
- }
-
- llama_token llama_token_eos(const struct llama_model * model) {
- return llama_token_eos_impl(model->vocab);
- }
-
- llama_token llama_token_eot(const struct llama_model * model) {
- return llama_token_eot_impl(model->vocab);
- }
-
- llama_token llama_token_cls(const struct llama_model * model) {
- return llama_token_cls_impl(model->vocab);
- }
-
- llama_token llama_token_sep(const struct llama_model * model) {
- return llama_token_sep_impl(model->vocab);
- }
-
- llama_token llama_token_nl (const struct llama_model * model) {
- return llama_token_nl_impl(model->vocab);
- }
-
- llama_token llama_token_pad(const struct llama_model * model) {
- return llama_token_pad_impl(model->vocab);
- }
-
- bool llama_add_bos_token(const struct llama_model * model) {
- return llama_add_bos_token_impl(model->vocab);
- }
-
- bool llama_add_eos_token(const struct llama_model * model) {
- return llama_add_eos_token_impl(model->vocab);
- }
-
- llama_token llama_token_prefix(const struct llama_model * model) {
- return llama_token_prefix_impl(model->vocab);
- }
-
- llama_token llama_token_middle(const struct llama_model * model) {
- return llama_token_middle_impl(model->vocab);
- }
-
- llama_token llama_token_suffix(const struct llama_model * model) {
- return llama_token_suffix_impl(model->vocab);
- }
-
- llama_token llama_token_fim_pre(const struct llama_model * model) {
- return llama_token_fim_pre_impl(model->vocab);
- }
-
- llama_token llama_token_fim_suf(const struct llama_model * model) {
- return llama_token_fim_suf_impl(model->vocab);
- }
-
- llama_token llama_token_fim_mid(const struct llama_model * model) {
- return llama_token_fim_mid_impl(model->vocab);
- }
-
- llama_token llama_token_fim_pad(const struct llama_model * model) {
- return llama_token_fim_pad_impl(model->vocab);
- }
-
- llama_token llama_token_fim_rep(const struct llama_model * model) {
- return llama_token_fim_rep_impl(model->vocab);
- }
-
- llama_token llama_token_fim_sep(const struct llama_model * model) {
- return llama_token_fim_sep_impl(model->vocab);
- }
-
- //
- // tokenization
- //
-
- int32_t llama_tokenize(
- const struct llama_model * model,
- const char * text,
- int32_t text_len,
- llama_token * tokens,
- int32_t n_tokens_max,
- bool add_special,
- bool parse_special) {
- return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
- }
-
- int32_t llama_token_to_piece(
- const struct llama_model * model,
- llama_token token,
- char * buf,
- int32_t length,
- int32_t lstrip,
- bool special) {
- return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
- }
-
- int32_t llama_detokenize(
- const struct llama_model * model,
- const llama_token * tokens,
- int32_t n_tokens,
- char * text,
- int32_t text_len_max,
- bool remove_special,
- bool unparse_special) {
- return llama_detokenize_impl(model->vocab, tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
- }
-
  //
  // chat templates
  //
 
  int32_t llama_chat_apply_template(
- const struct llama_model * model,
  const char * tmpl,
  const struct llama_chat_message * chat,
  size_t n_msg,
  bool add_ass,
  char * buf,
  int32_t length) {
- std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
- if (tmpl == nullptr) {
- LM_GGML_ASSERT(model != nullptr);
-
- // load template from model, if available
- const auto & it = model->lm_gguf_kv.find("tokenizer.chat_template");
- if (it != model->lm_gguf_kv.end() && it->second.size() > 0) {
- curr_tmpl = it->second;
- }
- else {
- // worst case: there is no information about template, we will use chatml by default
- curr_tmpl = "chatml"; // see llm_chat_apply_template
- }
- }
+ const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);
 
  // format the chat to string
  std::vector<const llama_chat_message *> chat_vec;
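The hunk above removes the per-model vocab and tokenization bridges from llama.cpp and drops the model parameter from `llama_chat_apply_template()` (a null `tmpl` now falls back to "chatml" instead of reading the model's GGUF metadata). A sketch of the vocab-based replacements; the names `llama_model_get_vocab()`, `llama_vocab_bos()` and the vocab-taking `llama_tokenize()` follow the upstream llama.cpp vocab refactor and are not shown in this hunk, so treat them as assumptions:

```cpp
#include <string.h>

#include "llama.h"

// Assumed vocab-based replacements for the removed model-level bridges (see lead-in).
static int32_t tokenize_example(const struct llama_model * model, const char * text,
                                llama_token * out, int32_t out_cap) {
    const struct llama_vocab * vocab = llama_model_get_vocab(model); // assumed accessor

    const llama_token bos = llama_vocab_bos(vocab); // was llama_token_bos(model)
    (void) bos;

    // was llama_tokenize(model, ...); now takes the vocab directly
    return llama_tokenize(vocab, text, (int32_t) strlen(text), out, out_cap,
                          /*add_special*/ true, /*parse_special*/ false);
}
```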
@@ -12437,23 +9984,6 @@ int32_t llama_chat_apply_template(
  return res;
  }
 
- //
- // sampling
- //
-
- // TODO: remove indirection when vocab becomes accesible in llama-sampling.cpp
- struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
- return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
- }
-
- struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
- return llama_sampler_init_infill_impl(model->vocab);
- }
-
- struct llama_sampler * llama_sampler_init_dry(const struct llama_model * model, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
- return llama_sampler_init_dry_impl(model->vocab, llama_n_ctx_train(model), dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers, num_breakers);
- }
-
  //
  // model split
  //