cui-llama.rn 1.4.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jni.cpp +9 -9
- package/cpp/common.cpp +163 -60
- package/cpp/common.h +43 -12
- package/cpp/ggml-alloc.c +1042 -1037
- package/cpp/ggml-backend-impl.h +255 -256
- package/cpp/ggml-backend-reg.cpp +582 -582
- package/cpp/ggml-backend.cpp +2002 -2002
- package/cpp/ggml-backend.h +354 -352
- package/cpp/ggml-common.h +1853 -1853
- package/cpp/ggml-cpp.h +39 -39
- package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
- package/cpp/ggml-cpu-aarch64.h +8 -8
- package/cpp/ggml-cpu-impl.h +386 -386
- package/cpp/ggml-cpu-quants.c +10920 -10839
- package/cpp/ggml-cpu-traits.cpp +36 -36
- package/cpp/ggml-cpu-traits.h +38 -38
- package/cpp/ggml-cpu.c +329 -60
- package/cpp/ggml-cpu.cpp +10 -2
- package/cpp/ggml-cpu.h +135 -135
- package/cpp/ggml-impl.h +567 -567
- package/cpp/ggml-metal-impl.h +17 -17
- package/cpp/ggml-metal.m +4884 -4884
- package/cpp/ggml-quants.c +5238 -5238
- package/cpp/ggml-threading.h +14 -14
- package/cpp/ggml.c +6514 -6448
- package/cpp/ggml.h +2194 -2163
- package/cpp/gguf.cpp +1329 -1325
- package/cpp/gguf.h +202 -202
- package/cpp/json-schema-to-grammar.cpp +1045 -1045
- package/cpp/json-schema-to-grammar.h +8 -8
- package/cpp/json.hpp +24766 -24766
- package/cpp/llama-adapter.cpp +347 -346
- package/cpp/llama-adapter.h +74 -73
- package/cpp/llama-arch.cpp +1487 -1434
- package/cpp/llama-arch.h +400 -395
- package/cpp/llama-batch.cpp +368 -368
- package/cpp/llama-batch.h +88 -88
- package/cpp/llama-chat.cpp +578 -567
- package/cpp/llama-chat.h +52 -51
- package/cpp/llama-context.cpp +1775 -1771
- package/cpp/llama-context.h +128 -128
- package/cpp/llama-cparams.cpp +1 -1
- package/cpp/llama-cparams.h +37 -37
- package/cpp/llama-cpp.h +30 -30
- package/cpp/llama-grammar.cpp +1139 -1139
- package/cpp/llama-grammar.h +143 -143
- package/cpp/llama-hparams.cpp +71 -71
- package/cpp/llama-hparams.h +139 -140
- package/cpp/llama-impl.cpp +167 -167
- package/cpp/llama-impl.h +61 -61
- package/cpp/llama-kv-cache.cpp +718 -718
- package/cpp/llama-kv-cache.h +218 -218
- package/cpp/llama-mmap.cpp +2 -1
- package/cpp/llama-mmap.h +67 -67
- package/cpp/llama-model-loader.cpp +1124 -1011
- package/cpp/llama-model-loader.h +167 -158
- package/cpp/llama-model.cpp +3997 -2202
- package/cpp/llama-model.h +370 -391
- package/cpp/llama-sampling.cpp +2408 -2406
- package/cpp/llama-sampling.h +32 -48
- package/cpp/llama-vocab.cpp +3247 -1982
- package/cpp/llama-vocab.h +125 -182
- package/cpp/llama.cpp +416 -2886
- package/cpp/llama.h +1323 -1285
- package/cpp/log.cpp +401 -401
- package/cpp/log.h +121 -121
- package/cpp/rn-llama.hpp +18 -12
- package/cpp/sampling.cpp +505 -500
- package/cpp/sgemm.cpp +2597 -2597
- package/cpp/speculative.cpp +277 -274
- package/cpp/speculative.h +28 -28
- package/cpp/unicode.cpp +2 -3
- package/package.json +1 -1
package/cpp/llama.cpp
CHANGED
@@ -8,2512 +8,80 @@
 #include "llama-kv-cache.h"
 #include "llama-model-loader.h"
 #include "llama-model.h"
-#include "llama-kv-cache.h"
-#include "llama-model-loader.h"
-#include "llama-model.h"
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#include "ggml-cpp.h"
-
-#include <algorithm>
-#include <array>
-#include <cassert>
-#include <cfloat>
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <functional>
-#include <initializer_list>
-#include <map>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING)
-#include <android/log.h>
-#define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
-#undef LLAMA_LOG_INFO
-#undef LLAMA_LOG_WARN
-#undef LLAMA_LOG_ERROR
-#define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
-#define LLAMA_LOG_WARN(...) __android_log_print(ANDROID_LOG_WARN , LLAMA_ANDROID_TAG, __VA_ARGS__)
-#define LLAMA_LOG_ERROR(...) __android_log_print(ANDROID_LOG_ERROR, LLAMA_ANDROID_TAG, __VA_ARGS__)
-#endif // __ANDROID__
-
-#if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING)
-#include <android/log.h>
-#define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
-#undef LLAMA_LOG_INFO
-#undef LLAMA_LOG_WARN
-#undef LLAMA_LOG_ERROR
-#define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
-#define LLAMA_LOG_WARN(...) __android_log_print(ANDROID_LOG_WARN , LLAMA_ANDROID_TAG, __VA_ARGS__)
-#define LLAMA_LOG_ERROR(...) __android_log_print(ANDROID_LOG_ERROR, LLAMA_ANDROID_TAG, __VA_ARGS__)
-#endif // __ANDROID__
-
-//
-// tensor loading (TODO: add llama_tesor_loader?)
-//
-
-static int llama_get_device_count(const llama_model & model) {
-    return (int) model.devices.size();
-}
-
-// checks if the weight tensor can be used with the specified buffer type and device
-static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor * w, lm_ggml_op op, lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_dev_t dev) {
-    LM_GGML_ASSERT(w != nullptr);
-
-    if (op == LM_GGML_OP_NONE) {
-        return true;
-    }
-
-    lm_ggml_init_params params = {
-        /*.mem_size =*/ lm_ggml_tensor_overhead()*8,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc =*/ true,
-    };
-    lm_ggml_context_ptr ctx_ptr { lm_ggml_init(params) };
-    if (!ctx_ptr) {
-        throw std::runtime_error(format("failed to create ggml context"));
-    }
-    lm_ggml_context * ctx = ctx_ptr.get();
-
-    lm_ggml_tensor * op_tensor = nullptr;
-
-    switch (op) {
-        case LM_GGML_OP_GET_ROWS:
-            {
-                lm_ggml_tensor * b = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, 512);
-                op_tensor = lm_ggml_get_rows(ctx, w, b);
-            } break;
-        case LM_GGML_OP_MUL_MAT:
-            {
-                lm_ggml_tensor * b = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
-                op_tensor = lm_ggml_mul_mat(ctx, w, b);
-            } break;
-        case LM_GGML_OP_MUL_MAT_ID:
-            {
-                int n_expert_used = hparams.n_expert_used;
-                lm_ggml_tensor * b = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
-                lm_ggml_tensor * ids = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_I32, n_expert_used, 512);
-                op_tensor = lm_ggml_mul_mat_id(ctx, w, b, ids);
-            } break;
-        case LM_GGML_OP_ADD:
-            {
-                lm_ggml_tensor * a = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
-                op_tensor = lm_ggml_add(ctx, a, w);
-            } break;
-        case LM_GGML_OP_MUL:
-            {
-                lm_ggml_tensor * a = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
-                op_tensor = lm_ggml_mul(ctx, a, w);
-            } break;
-        case LM_GGML_OP_DIV:
-            {
-                lm_ggml_tensor * a = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, w->ne[0]);
-                op_tensor = lm_ggml_div(ctx, a, w);
-            } break;
-        case LM_GGML_OP_ROPE:
-            {
-                int n_embd_head = hparams.n_embd_head_v;
-                int n_head = hparams.n_head();
-                lm_ggml_tensor * a = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, n_embd_head, n_head, 512);
-                lm_ggml_tensor * b = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, 512);
-                op_tensor = lm_ggml_rope_ext(
-                    ctx, a, b, w,
-                    0, 0, 0, 0, 0,
-                    0, 0, 0, 0
-                );
-
-            } break;
-        case LM_GGML_OP_SSM_CONV:
-            {
-                // FIXME
-                lm_ggml_tensor * conv_x = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, 12345, w->ne[1], 6789);
-                op_tensor = lm_ggml_ssm_conv(ctx, conv_x, w);
-            } break;
-        case LM_GGML_OP_SSM_SCAN:
-            {
-                // FIXME
-                const int64_t d_state = w->ne[0];
-                const int64_t d_inner = w->ne[1];
-                const int64_t n_seq_tokens = 512;
-                const int64_t n_seqs = 1;
-                lm_ggml_tensor * s = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_state, d_inner, n_seqs);
-                lm_ggml_tensor * x = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
-                lm_ggml_tensor * dt = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
-                lm_ggml_tensor * B = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
-                lm_ggml_tensor * C = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
-                op_tensor = lm_ggml_ssm_scan(ctx, s, x, dt, w, B, C);
-            } break;
-        case LM_GGML_OP_RWKV_WKV6:
-            {
-                // FIXME
-                const int64_t S = 123;
-                const int64_t H = 123;
-                const int64_t n_tokens = 123;
-                const int64_t n_seqs = 123;
-                lm_ggml_tensor * k = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, S, 1, H, n_tokens);
-                lm_ggml_tensor * v = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, 1, S, H, n_tokens);
-                lm_ggml_tensor * r = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, 1, S, H, n_tokens);
-                lm_ggml_tensor * tf = w;
-                lm_ggml_tensor * td = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, 1, S, H, n_tokens);
-                lm_ggml_tensor * state = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, S, n_seqs, S, H);
-                op_tensor = lm_ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
-            } break;
-        case LM_GGML_OP_IM2COL:
-            {
-                const int n_embd = hparams.n_embd;
-                lm_ggml_tensor * b = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
-                op_tensor = lm_ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, LM_GGML_TYPE_F16);
-            } break;
-        default:
-            LM_GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, lm_ggml_op_name(op), w->name);
-    }
-
-    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
-    LM_GGML_ASSERT(w->buffer == nullptr);
-    w->buffer = lm_ggml_backend_buft_alloc_buffer(buft, 0);
-    bool op_supported = lm_ggml_backend_dev_supports_op(dev, op_tensor);
-    lm_ggml_backend_buffer_free(w->buffer);
-    w->buffer = nullptr;
-
-    return op_supported;
-}
-
-// find the first buffer type in the list that can use the tensor
-static lm_ggml_backend_buffer_type_t select_weight_buft(const llama_model & model, lm_ggml_tensor * tensor, lm_ggml_op op, const llama_model::buft_list_t & buft_list) {
-    LM_GGML_ASSERT(!buft_list.empty());
-    for (const auto & cur : buft_list) {
-        lm_ggml_backend_dev_t cur_dev = cur.first;
-        lm_ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (weight_buft_supported(model.hparams, tensor, op, cur_buft, cur_dev)) {
-            return cur_buft;
-        }
-    }
-    return nullptr;
-}
-
-// CPU: ACCEL -> CPU extra -> GPU host -> CPU
-static llama_model::buft_list_t make_cpu_buft_list(llama_model & model) {
-    llama_model::buft_list_t buft_list;
-
-    // add ACCEL buffer types
-    for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
-        lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
-        if (lm_ggml_backend_dev_type(dev) == LM_GGML_BACKEND_DEVICE_TYPE_ACCEL) {
-            auto * buft = lm_ggml_backend_dev_buffer_type(dev);
-            // skip
-            if (buft != lm_ggml_backend_cpu_buffer_type()) {
-                buft_list.emplace_back(dev, buft);
-            }
-        }
-    }
-
-    // add extra buffer types
-    auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
-    auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
-    auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
-        lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
-    if (lm_ggml_backend_dev_get_extra_bufts_fn) {
-        lm_ggml_backend_buffer_type_t * extra_bufts = lm_ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-        while (extra_bufts && *extra_bufts) {
-            buft_list.emplace_back(cpu_dev, *extra_bufts);
-            ++extra_bufts;
-        }
-    }
-
-    // add a host buffer type
-    // storing the tensors in a host buffer is useful when the processing of large batches
-    // is offloaded to a GPU device, since it reduces the time spent on data transfers
-    // generally, this will be done using the first device in the list
-    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
-    // function of the device to determine if it would benefit from being stored in a host buffer
-    for (auto * dev : model.devices) {
-        lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_dev_host_buffer_type(dev);
-        if (buft) {
-            buft_list.emplace_back(dev, buft);
-            break;
-        }
-    }
-
-    // add the CPU buffer type
-    for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
-        lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
-        if (lm_ggml_backend_dev_type(dev) == LM_GGML_BACKEND_DEVICE_TYPE_CPU) {
-            buft_list.emplace_back(dev, lm_ggml_backend_dev_buffer_type(dev));
-        }
-    }
-
-    return buft_list;
-}
-
-// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
-static llama_model::buft_list_t make_gpu_buft_list(lm_ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) {
-    llama_model::buft_list_t buft_list;
-
-    // add the device split buffer type if requested and available
-    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-        lm_ggml_backend_reg_t reg = lm_ggml_backend_dev_backend_reg(dev);
-        auto lm_ggml_backend_split_buffer_type_fn = (lm_ggml_backend_split_buffer_type_t)
-            lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_split_buffer_type");
-        if (lm_ggml_backend_split_buffer_type_fn) {
-            size_t dev_index = [&]() {
-                auto * reg = lm_ggml_backend_dev_backend_reg(dev);
-                for (size_t i = 0; i < lm_ggml_backend_reg_dev_count(reg); ++i) {
-                    if (lm_ggml_backend_reg_dev_get(reg, i) == dev) {
-                        return i;
-                    }
-                }
-                throw std::runtime_error(format("device %s not found in its backend reg", lm_ggml_backend_dev_name(dev)));
-            }();
-            auto * buft = lm_ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
-            if (buft != nullptr) {
-                buft_list.emplace_back(dev, buft);
-            }
-        }
-    }
-
-    // add the device default buffer type
-    buft_list.emplace_back(dev, lm_ggml_backend_dev_buffer_type(dev));
-
-    return buft_list;
-}
-
-// Returns false if cancelled by progress_callback
-static bool llm_load_tensors(
-        llama_model_loader & ml,
-        llama_model & model,
-        int n_gpu_layers,
-        enum llama_split_mode split_mode,
-        int main_gpu,
-        const float * tensor_split,
-        bool use_mlock,
-        llama_progress_callback progress_callback,
-        void * progress_callback_user_data) {
-    auto & hparams = model.hparams;
-
-    model.split_mode = split_mode;
-    model.main_gpu = main_gpu;
-    model.n_gpu_layers = n_gpu_layers;
-
-    const int n_layer = hparams.n_layer;
-
-    bool use_mmap_buffer = true;
-
-    // build a list of buffer types for the CPU and GPU devices
-    model.cpu_buft_list = make_cpu_buft_list(model);
-    for (auto * dev : model.devices) {
-        llama_model::buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
-        // add CPU buffer types as a fallback
-        buft_list.insert(buft_list.end(), model.cpu_buft_list.begin(), model.cpu_buft_list.end());
-        model.gpu_buft_list.emplace(dev, std::move(buft_list));
-    }
-
-    // calculate the split points
-    int device_count = llama_get_device_count(model);
-    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
-    std::vector<float> splits(device_count);
-    if (all_zero) {
-        // default split, by free memory
-        for (int i = 0; i < device_count; ++i) {
-            lm_ggml_backend_dev_t dev = model.devices[i];
-            size_t total;
-            size_t free;
-            lm_ggml_backend_dev_memory(dev, &free, &total);
-            splits[i] = free;
-        }
-    } else {
-        std::copy(tensor_split, tensor_split + device_count, splits.begin());
-    }
-
-    // sum and normalize the splits to get the split points
-    float split_sum = 0.0f;
-    for (int i = 0; i < device_count; ++i) {
-        split_sum += splits[i];
-        splits[i] = split_sum;
-    }
-    for (int i = 0; i < device_count; ++i) {
-        splits[i] /= split_sum;
-    }
-
-    lm_ggml_backend_dev_t cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
-    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
-    const int act_gpu_layers = model.devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
-    auto get_layer_buft_list = [&](int il) -> llama_model::layer_dev {
-        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
-            return {cpu_dev, &model.cpu_buft_list};
-        }
-        int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
-        auto * dev = model.devices.at(layer_gpu);
-        return {dev, &model.gpu_buft_list.at(dev)};
-    };
-
-    // assign the input layer
-    // there is very little benefit to offloading the input layer, so always keep it on the CPU
-    model.dev_input = { cpu_dev, &model.cpu_buft_list };
-
-    // assign the repeating layers to the devices according to the splits
-    model.dev_layer.resize(n_layer);
-    for (int il = 0; il < n_layer; ++il) {
-        model.dev_layer[il] = get_layer_buft_list(il);
-    }
-    // assign the output layer
-    model.dev_output = get_layer_buft_list(n_layer);
-
-    // one ggml context per buffer type
-    int max_n_tensors = ml.n_tensors;
-    max_n_tensors += 1; // duplicated output tensor
-    max_n_tensors += n_layer*2; // duplicated rope freq tensors
-    const size_t ctx_size = lm_ggml_tensor_overhead()*max_n_tensors;
-
-    std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
-    auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
-        auto it = ctx_map.find(buft);
-        if (it == ctx_map.end()) {
-            lm_ggml_init_params params = {
-                /*.mem_size =*/ ctx_size,
-                /*.mem_buffer =*/ NULL,
-                /*.no_alloc =*/ true,
-            };
-            lm_ggml_context * ctx = lm_ggml_init(params);
-            if (!ctx) {
-                throw std::runtime_error(format("failed to create ggml context"));
-            }
-            ctx_map[buft] = ctx;
-            model.ctxs.emplace_back(ctx);
-            return ctx;
-        }
-        return it->second;
-    };
-
-    // create tensors for the weights
-    {
-        // note: cast to int64_t since we will use these for the tensor dimensions
-        const int64_t n_head = hparams.n_head();
-        const int64_t n_head_kv = hparams.n_head_kv();
-        const int64_t n_embd = hparams.n_embd;
-        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-        const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-        const int64_t n_embd_head_v = hparams.n_embd_head_v;
-        const int64_t n_ff = hparams.n_ff();
-        const int64_t n_embd_gqa = n_embd_v_gqa;
-        const int64_t n_vocab = hparams.n_vocab;
-        const int64_t n_vocab_type = hparams.n_vocab_type;
-        const int64_t n_rot = hparams.n_rot;
-        const int64_t n_expert = hparams.n_expert;
-        const int64_t n_expert_used = hparams.n_expert_used;
-        const int64_t n_ctx_train = hparams.n_ctx_train;
-
-        if (n_expert > 0 && hparams.n_expert_used == 0) {
-            throw std::runtime_error("model has expert layers but no expert layers are used");
-        }
-
-        int n_moved_tensors = 0;
-        lm_ggml_tensor * first_moved_tensor = nullptr;
-        lm_ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
-        lm_ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
-
-        auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> lm_ggml_tensor * {
-            lm_ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
-
-            if (!t_meta) {
-                if (flags & llama_model_loader::TENSOR_NOT_REQUIRED) {
-                    return nullptr;
-                }
-                throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
-            }
-
-            // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
-            // the tensor is duplicated
-            // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
-            llm_tensor tn_tensor = tn.tensor;
-            if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & llama_model_loader::TENSOR_DUPLICATED) {
-                tn_tensor = LLM_TENSOR_OUTPUT;
-            }
-
-            llm_tensor_info info;
-            try {
-                info = llm_tensor_info_for(tn_tensor);
-            } catch (const std::out_of_range & e) {
-                throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
-            }
-
-            // tensors with "bias" suffix are always used with LM_GGML_OP_ADD
-            lm_ggml_op op;
-            bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
-            if (bias) {
-                op = LM_GGML_OP_ADD;
-            } else {
-                op = info.op;
-            }
-
-            // sanity checks
-            if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
-                if (tn.bid != -1) {
-                    LM_GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
-                }
-            } else {
-                if (tn.bid == -1) {
-                    LM_GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
-                }
-            }
-
-            // select the buffer type for this tensor
-            llama_model::buft_list_t * buft_list;
-            switch (info.layer) {
-                case LLM_TENSOR_LAYER_INPUT:
-                    buft_list = model.dev_input.buft_list;
-                    break;
-                case LLM_TENSOR_LAYER_OUTPUT:
-                    buft_list = model.dev_output.buft_list;
-                    break;
-                case LLM_TENSOR_LAYER_REPEATING:
-                    buft_list = model.dev_layer.at(tn.bid).buft_list;
-                    break;
-                default:
-                    LM_GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
-            }
-
-            lm_ggml_backend_buffer_type_t buft = select_weight_buft(model, t_meta, op, *buft_list);
-            if (!buft) {
-                throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
-            }
-
-            // avoid using a host buffer when using mmap
-            auto * buft_dev = lm_ggml_backend_buft_get_device(buft);
-            if (ml.use_mmap && buft_dev && buft == lm_ggml_backend_dev_host_buffer_type(buft_dev)) {
-                auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
-                buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
-            }
-
-            if (buft != buft_list->front().second) {
-                n_moved_tensors++;
-                if (!first_moved_tensor) {
-                    first_moved_tensor = t_meta;
-                    first_moved_from_buft = buft_list->front().second;
-                    first_moved_to_buft = buft;
-                }
-            }
-
-            lm_ggml_context * ctx = ctx_for_buft(buft);
-
-            // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
-            if (flags & llama_model_loader::TENSOR_DUPLICATED) {
-                lm_ggml_tensor * t = lm_ggml_get_tensor(ctx, tn.str().c_str());
-                if (t) {
-                    return t;
-                }
-            }
-            return ml.create_tensor(ctx, tn, ne, flags);
-        };
-
-        model.layers.resize(n_layer);
-
-        // TODO: move to a separate function
-        const auto tn = LLM_TN(model.arch);
-        switch (model.arch) {
-            case LLM_ARCH_LLAMA:
-            case LLM_ARCH_REFACT:
-            case LLM_ARCH_MINICPM:
-            case LLM_ARCH_GRANITE:
-            case LLM_ARCH_GRANITE_MOE:
-                {
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (model.output == NULL) {
-                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        // optional bias tensors
-                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
-                            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                        }
-                        else {
-                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                        }
-
-                        if (n_expert == 0) {
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
-                            // optional MLP bias
-                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        } else {
-                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
-                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_DECI:
-                {
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (model.output == NULL) {
-                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-                        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
-                        const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
-                        const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
-                        const int64_t n_ff = hparams.n_ff(i);
-                        const int64_t n_head = hparams.n_head(i);
-                        const int64_t n_head_kv = hparams.n_head_kv(i);
-
-                        if (n_head_kv == 0 && n_head > 0) {
-                            // linear attention for DeciLMCausalModel
-                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        }
-                        else if (n_head_kv > 0) {
-                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-                        }
-
-                        // optional bias tensors
-                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
-                            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                        }
-                        else {
-                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                        }
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
-                        // optional MLP bias
-                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    }
-                } break;
-            case LLM_ARCH_MINICPM3:
-                {
-                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
-                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
-
-                    const int64_t q_lora_rank = hparams.n_lora_q;
-                    const int64_t kv_lora_rank = hparams.n_lora_kv;
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (model.output == NULL) {
-                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
-
-                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
-
-                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
-                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
-
-                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
-                        layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
-                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                    }
-                } break;
-            case LLM_ARCH_GROK:
-                {
-                    if (n_expert == 0) {
-                        throw std::runtime_error("Grok model cannot have zero experts");
-                    }
-
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (model.output == NULL) {
-                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
-                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
-
-                        layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_DBRX:
-                {
-                    if (n_expert == 0) {
-                        throw std::runtime_error("DBRX model cannot have zero experts");
-                    }
-
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
-                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
-                    }
-                } break;
-            case LLM_ARCH_BAICHUAN:
-                {
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    {
-                        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_FALCON:
-                {
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    {
-                        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-
-                        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        if (!model.output) {
-                            model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
-                        }
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
-                        layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_STARCODER:
-                {
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
-
-                    // output
-                    {
-                        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-                        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        if (!model.output) {
-                            // needs to be on GPU
-                            model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
-
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_BERT:
-            case LLM_ARCH_NOMIC_BERT:
-                {
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    model.type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0);
-
-                    if (model.arch == LLM_ARCH_BERT) {
-                        model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
-
-                        model.cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        model.cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        model.cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    }
-
-                    model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
-                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        if (model.arch == LLM_ARCH_BERT) {
-                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
-
-                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
-
-                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-                        } else {
-                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        }
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
-                        if (model.arch == LLM_ARCH_BERT) {
-                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-                            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
-                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-                        } else {
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                        }
-
-                        layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
-                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_JINA_BERT_V2:
-                {
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
-                    model.type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0); // token_type_embeddings
-
-                    model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
-                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); //LayerNorm bias
-
-                    model.cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i]; // JinaBertLayer
-
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
-
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
-
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
-                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
-
-                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
-                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
-
-                        layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
-                        layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
-                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_BLOOM:
-                {
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
-                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
-
-                    // output
-                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_MPT:
-                {
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                    // output
-                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    if (!model.output) {
-                        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        // AWQ ScaleActivation layer
-                        layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    }
-                } break;
-            case LLM_ARCH_STABLELM:
-                {
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        // optional bias tensors, present in Stable LM 2 1.6B
-                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        // optional q and k layernorms, present in StableLM 2 12B
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
1061
|
-
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
1062
|
-
|
1063
|
-
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
1064
|
-
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
1065
|
-
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
1066
|
-
}
|
1067
|
-
} break;
|
1068
|
-
- case LLM_ARCH_QWEN:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
-            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
-        }
-    } break;
- case LLM_ARCH_QWEN2:
- case LLM_ARCH_QWEN2VL:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-        // if output is NULL, init from the input tok embed
-        if (model.output == NULL) {
-            model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-        }
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-            // optional bias tensors
-            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
-            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
-            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_QWEN2MOE:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-            // optional bias tensors
-            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
-            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
-            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
-            if (n_expert == 0) {
-                throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
-            }
-            if (n_expert_used == 0) {
-                throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
-            }
-
-            // MoE branch
-            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-
-            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
-            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
-            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
-
-            // Shared expert branch
-            const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
-
-            layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
-            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
-            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
-            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
-        }
-    } break;
- case LLM_ARCH_PHI2:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-        model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
-            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-            if (layer.wqkv == nullptr) {
-                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
-
-                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
-
-                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-            }
-
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_PHI3:
-    {
-        const int64_t n_embd_head = n_embd / n_head;
-
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
-
-            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
-
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
-
-            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-        }
-    } break;
- case LLM_ARCH_PLAMO:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_GPT2:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-        model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
-            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_CODESHELL:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
-            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_ORION:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_INTERNLM2:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-            // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_GEMMA:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-        }
-    } break;
- case LLM_ARCH_GEMMA2:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-        }
-    } break;
- case LLM_ARCH_STARCODER2:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-        // if output is NULL, init from the input tok embed
-        if (model.output == NULL) {
-            model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-        }
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-            // optional bias tensors
-            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
-            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
-            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
-            // optional bias tensors
-            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_MAMBA:
-    {
-        const int64_t d_conv = hparams.ssm_d_conv;
-        const int64_t d_inner = hparams.ssm_d_inner;
-        const int64_t d_state = hparams.ssm_d_state;
-        const int64_t dt_rank = hparams.ssm_dt_rank;
-
-        // only an expansion factor of 2 is supported for now
-        if (2 * n_embd != d_inner) {
-            throw std::runtime_error("only an expansion factor of 2 is supported for now");
-        }
-
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-        // if output is NULL, init from the input tok embed, duplicated to allow offloading
-        if (model.output == NULL) {
-            model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-        }
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            // norm
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
-
-            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
-            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
-
-            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
-
-            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
-            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
-
-            // no "weight" suffix for these
-            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
-            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
-
-            // out_proj
-            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
-        }
-    } break;
- case LLM_ARCH_XVERSE:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_COMMAND_R:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        // init output from the input tok embed
-        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            if (n_layer >= 64){
-                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
-                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
-            }
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_COHERE2:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-        // init output from the input tok embed
-        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
-                                     llama_model_loader::TENSOR_DUPLICATED);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
-
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
-        }
-    }
-    break;
- case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-        // if output is NULL, init from the input tok embed
-        if (model.output == NULL) {
-            model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-        }
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_OLMO2:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
-            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
-            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-        }
-    } break;
- case LLM_ARCH_OLMOE:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
-            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
-            if (n_expert == 0) {
-                throw std::runtime_error("n_expert must be > 0");
-            }
-            if (n_expert_used == 0) {
-                throw std::runtime_error("n_expert_used must be > 0");
-            }
-
-            // MoE branch
-            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
-            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
-            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
-        }
-    } break;
- case LLM_ARCH_OPENELM:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        // init output from the input tok embed
-        model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-
-        for (int i = 0; i < n_layer; ++i) {
-            const int64_t n_head = hparams.n_head(i);
-            const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
-            const int64_t n_ff = hparams.n_ff(i);
-
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
-            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_GPTNEOX:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
-            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_ARCTIC:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-        // if output is NULL, init from the input tok embed
-        if (model.output == NULL) {
-            model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-        }
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
-
-            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-            layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
-            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
-            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
-            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
-        }
-    } break;
- case LLM_ARCH_DEEPSEEK:
-    {
-
-        const int64_t n_ff_exp = hparams.n_ff_exp;
-        const int64_t n_expert_shared = hparams.n_expert_shared;
-
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-            if (i < (int) hparams.n_layer_dense_lead) {
-                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-            } else {
-                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
-                if (n_expert == 0) {
-                    throw std::runtime_error("n_expert must be > 0");
-                }
-                if (n_expert_used == 0) {
-                    throw std::runtime_error("n_expert_used must be > 0");
-                }
-
-                // MoE branch
-                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
-                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
-                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
-
-                // Shared expert branch
-                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
-                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-            }
-        }
-    } break;
- case LLM_ARCH_DEEPSEEK2:
-    {
-        const bool is_lite = (hparams.n_layer == 27);
-
-        const int64_t n_embd_head_qk_rope = hparams.n_rot;
-        const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
-
-        const int64_t q_lora_rank = hparams.n_lora_q;
-        const int64_t kv_lora_rank = hparams.n_lora_kv;
-
-        const int64_t n_ff_exp = hparams.n_ff_exp;
-        const int64_t n_expert_shared = hparams.n_expert_shared;
-
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-            if (!is_lite) {
-                layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
-            }
-
-            layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
-
-            if (!is_lite) {
-                layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
-                layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
-            } else {
-                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-            }
-
-            layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
-            layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-            if (i < (int) hparams.n_layer_dense_lead) {
-                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-            } else {
-                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                if (n_expert == 0) {
-                    throw std::runtime_error("n_expert must be > 0");
-                }
-                if (n_expert_used == 0) {
-                    throw std::runtime_error("n_expert_used must be > 0");
-                }
-
-                // MoE branch
-                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
-                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
-                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
-
-                // Shared expert branch
-                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
-                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-            }
-        }
-    } break;
- case LLM_ARCH_BITNET:
-    {
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-            layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-            layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-            layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-            layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
-
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-            layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-        }
-    } break;
- case LLM_ARCH_T5:
-    {
-        const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
-
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
-        model.output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-        model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-        // if output is NULL, init from the input tok embed
-        if (model.output == NULL) {
-            model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-        }
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
-            layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-            layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-            layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-            layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-            layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
-
-            layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-            layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
-            layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
-            layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-            layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
-
-            layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
-            // this tensor seems to be unused in HF transformers implementation
-            layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-            layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-            layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-            layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-            layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
-
-            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-        }
-    } break;
- case LLM_ARCH_T5ENCODER:
-    {
-        const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
-
-        model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-        // output
-        model.output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
1994
|
-
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
1995
|
-
// if output is NULL, init from the input tok embed
|
1996
|
-
if (model.output == NULL) {
|
1997
|
-
model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
1998
|
-
}
|
1999
|
-
|
2000
|
-
for (int i = 0; i < n_layer; ++i) {
|
2001
|
-
auto & layer = model.layers[i];
|
2002
|
-
|
2003
|
-
layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
|
2004
|
-
layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
2005
|
-
|
2006
|
-
layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
2007
|
-
layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
2008
|
-
layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
2009
|
-
layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
|
2010
|
-
|
2011
|
-
layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
|
2012
|
-
layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
2013
|
-
layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
2014
|
-
layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
2015
|
-
}
|
2016
|
-
} break;
|
2017
|
-
case LLM_ARCH_JAIS:
|
2018
|
-
{
|
2019
|
-
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
2020
|
-
|
2021
|
-
// output
|
2022
|
-
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
2023
|
-
model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
2024
|
-
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
2025
|
-
|
2026
|
-
for (int i = 0; i < n_layer; ++i) {
|
2027
|
-
auto & layer = model.layers[i];
|
2028
|
-
|
2029
|
-
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
2030
|
-
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
2031
|
-
|
2032
|
-
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
2033
|
-
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
2034
|
-
|
2035
|
-
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
2036
|
-
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
2037
|
-
|
2038
|
-
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
2039
|
-
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
2040
|
-
|
2041
|
-
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
2042
|
-
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
2043
|
-
|
2044
|
-
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
2045
|
-
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
|
2046
|
-
|
2047
|
-
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
2048
|
-
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
2049
|
-
}
|
2050
|
-
} break;
|
2051
|
-
case LLM_ARCH_CHATGLM:
|
2052
|
-
{
|
2053
|
-
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
2054
|
-
|
2055
|
-
// output
|
2056
|
-
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
2057
|
-
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
2058
|
-
|
2059
|
-
for (int i = 0; i < n_layer; ++i) {
|
2060
|
-
auto & layer = model.layers[i];
|
2061
|
-
|
2062
|
-
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
2063
|
-
|
2064
|
-
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
2065
|
-
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
2066
|
-
|
2067
|
-
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
2068
|
-
|
2069
|
-
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
2070
|
-
|
2071
|
-
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
|
2072
|
-
|
2073
|
-
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
2074
|
-
}
|
2075
|
-
} break;
|
2076
|
-
case LLM_ARCH_NEMOTRON:
|
2077
|
-
{
|
2078
|
-
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
2079
|
-
|
2080
|
-
// output
|
2081
|
-
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
2082
|
-
model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
2083
|
-
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
2084
|
-
|
2085
|
-
for (int i = 0; i < n_layer; ++i) {
|
2086
|
-
auto & layer = model.layers[i];
|
2087
|
-
|
2088
|
-
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
2089
|
-
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
2090
|
-
|
2091
|
-
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
2092
|
-
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
2093
|
-
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
2094
|
-
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
2095
|
-
|
2096
|
-
// optional bias tensors
|
2097
|
-
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
2098
|
-
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
2099
|
-
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
2100
|
-
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
2101
|
-
|
2102
|
-
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
2103
|
-
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
2104
|
-
|
2105
|
-
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
2106
|
-
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
2107
|
-
|
2108
|
-
// optional MLP bias
|
2109
|
-
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
2110
|
-
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
2111
|
-
}
|
2112
|
-
} break;
|
2113
|
-
case LLM_ARCH_EXAONE:
|
2114
|
-
{
|
2115
|
-
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
2116
|
-
|
2117
|
-
// output
|
2118
|
-
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
2119
|
-
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
2120
|
-
|
2121
|
-
for (int i = 0; i < n_layer; ++i) {
|
2122
|
-
auto & layer = model.layers[i];
|
2123
|
-
|
2124
|
-
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
2125
|
-
|
2126
|
-
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
2127
|
-
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
2128
|
-
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
2129
|
-
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
2130
|
-
|
2131
|
-
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
2132
|
-
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
2133
|
-
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
2134
|
-
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
2135
|
-
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
2136
|
-
}
|
2137
|
-
} break;
|
2138
|
-
case LLM_ARCH_RWKV6:
|
2139
|
-
{
|
2140
|
-
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
2141
|
-
|
2142
|
-
// Block 0, LN0
|
2143
|
-
model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
|
2144
|
-
model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
|
2145
|
-
|
2146
|
-
// output
|
2147
|
-
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
2148
|
-
model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
2149
|
-
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
2150
|
-
|
2151
|
-
const int time_mix_extra_dim = hparams.time_mix_extra_dim;
|
2152
|
-
const int time_decay_extra_dim = hparams.time_decay_extra_dim;
|
2153
|
-
const int head_size = hparams.wkv_head_size;
|
2154
|
-
const int attn_hidden_size = n_embd;
|
2155
|
-
const int ffn_size = hparams.n_ff_arr[0];
|
2156
|
-
|
2157
|
-
for (int i = 0; i < n_layer; ++i) {
|
2158
|
-
auto & layer = model.layers[i];
|
2159
|
-
|
2160
|
-
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
2161
|
-
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
2162
|
-
|
2163
|
-
layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
|
2164
|
-
layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
|
2165
|
-
|
2166
|
-
layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
|
2167
|
-
layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
|
2168
|
-
|
2169
|
-
layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
|
2170
|
-
layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, 0);
|
2171
|
-
layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
|
2172
|
-
layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, 0);
|
2173
|
-
layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
|
2174
|
-
layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, 0);
|
2175
|
-
|
2176
|
-
layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
|
2177
|
-
layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
|
2178
|
-
layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
|
2179
|
-
layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
|
2180
|
-
layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
|
2181
|
-
layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
|
2182
|
-
layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
|
2183
|
-
layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
|
2184
|
-
|
2185
|
-
layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
|
2186
|
-
layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
|
2187
|
-
layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
|
2188
|
-
|
2189
|
-
layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
|
2190
|
-
layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
|
2191
|
-
|
2192
|
-
layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
|
2193
|
-
layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
|
2194
|
-
layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
|
2195
|
-
}
|
2196
|
-
|
2197
|
-
} break;
|
2198
|
-
case LLM_ARCH_CHAMELEON:
|
2199
|
-
{
|
2200
|
-
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
2201
|
-
|
2202
|
-
// output
|
2203
|
-
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
2204
|
-
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
2205
|
-
// if output is NULL, init from the input tok embed
|
2206
|
-
if (model.output == NULL) {
|
2207
|
-
model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
2208
|
-
}
|
2209
|
-
|
2210
|
-
for (int i = 0; i < n_layer; ++i) {
|
2211
|
-
auto & layer = model.layers[i];
|
2212
|
-
|
2213
|
-
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
2214
|
-
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
|
2215
|
-
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
|
2216
|
-
layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
2217
|
-
layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
2218
|
-
|
2219
|
-
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
2220
|
-
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
2221
|
-
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
2222
|
-
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
2223
|
-
|
2224
|
-
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
2225
|
-
|
2226
|
-
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
2227
|
-
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
2228
|
-
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
2229
|
-
}
|
2230
|
-
} break;
|
2231
|
-
case LLM_ARCH_WAVTOKENIZER_DEC:
|
2232
|
-
{
|
2233
|
-
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
|
2234
|
-
|
2235
|
-
model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
|
2236
|
-
model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
|
2237
|
-
|
2238
|
-
// posnet
|
2239
|
-
{
|
2240
|
-
const int64_t n_embd = hparams.posnet.n_embd;
|
2241
|
-
|
2242
|
-
for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
|
2243
|
-
auto & layer = model.layers[i].posnet;
|
2244
|
-
|
2245
|
-
// posnet:
|
2246
|
-
//
|
2247
|
-
// - resnet
|
2248
|
-
// - resnet
|
2249
|
-
// - attn
|
2250
|
-
// - resnet
|
2251
|
-
// - resnet
|
2252
|
-
// - norm
|
2253
|
-
//
|
2254
|
-
switch (i) {
|
2255
|
-
case 0:
|
2256
|
-
case 1:
|
2257
|
-
case 3:
|
2258
|
-
case 4:
|
2259
|
-
{
|
2260
|
-
layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
|
2261
|
-
layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
|
2262
|
-
|
2263
|
-
layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
|
2264
|
-
layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
|
2265
|
-
|
2266
|
-
layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
|
2267
|
-
layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
|
2268
|
-
|
2269
|
-
layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
|
2270
|
-
layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
|
2271
|
-
} break;
|
2272
|
-
case 2:
|
2273
|
-
{
|
2274
|
-
layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
|
2275
|
-
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
|
2276
|
-
|
2277
|
-
layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
|
2278
|
-
layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
|
2279
|
-
|
2280
|
-
layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
|
2281
|
-
layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
|
2282
|
-
|
2283
|
-
layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
|
2284
|
-
layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
|
2285
|
-
|
2286
|
-
layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
|
2287
|
-
layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
|
2288
|
-
} break;
|
2289
|
-
case 5:
|
2290
|
-
{
|
2291
|
-
layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
|
2292
|
-
layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
|
2293
|
-
} break;
|
2294
|
-
default: LM_GGML_ABORT("unknown posnet layer");
|
2295
|
-
};
|
2296
|
-
}
|
2297
|
-
}
|
2298
|
-
|
2299
|
-
LM_GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
|
2300
|
-
|
2301
|
-
model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
|
2302
|
-
model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
|
2303
|
-
|
2304
|
-
// convnext
|
2305
|
-
{
|
2306
|
-
const int64_t n_embd = hparams.convnext.n_embd;
|
2307
|
-
|
2308
|
-
for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
|
2309
|
-
auto & layer = model.layers[i].convnext;
|
2310
|
-
|
2311
|
-
layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
|
2312
|
-
layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
|
2313
|
-
|
2314
|
-
layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
|
2315
|
-
layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
|
2316
|
-
|
2317
|
-
layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
|
2318
|
-
layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
|
2319
|
-
|
2320
|
-
layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
|
2321
|
-
layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
|
2322
|
-
|
2323
|
-
layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
|
2324
|
-
}
|
2325
|
-
|
2326
|
-
// output
|
2327
|
-
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
2328
|
-
model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
2329
|
-
}
|
2330
|
-
|
2331
|
-
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
|
2332
|
-
model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
|
2333
|
-
} break;
|
2334
|
-
default:
|
2335
|
-
throw std::runtime_error("unknown architecture");
|
2336
|
-
}
|
2337
|
-
|
2338
|
-
if (n_moved_tensors > 0) {
|
2339
|
-
LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
|
2340
|
-
__func__, first_moved_tensor->name, lm_ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
|
2341
|
-
lm_ggml_backend_buft_name(first_moved_from_buft), lm_ggml_backend_buft_name(first_moved_to_buft));
|
2342
|
-
}
|
2343
|
-
}
|
2344
|
-
|
2345
|
-
ml.done_getting_tensors();
|
2346
|
-
|
2347
|
-
ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
|
2348
|
-
model.mappings.reserve(ml.mappings.size());
|
2349
|
-
|
2350
|
-
// create the backend buffers
|
2351
|
-
std::vector<std::pair<lm_ggml_context *, llama_buf_map>> ctx_bufs;
|
2352
|
-
ctx_bufs.reserve(ctx_map.size());
|
2353
|
-
|
2354
|
-
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
|
2355
|
-
const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
|
2356
|
-
model.bufs.reserve(n_max_backend_buffer);
|
2357
|
-
|
2358
|
-
for (auto & it : ctx_map) {
|
2359
|
-
lm_ggml_backend_buffer_type_t buft = it.first;
|
2360
|
-
lm_ggml_context * ctx = it.second;
|
2361
|
-
|
2362
|
-
// skip contexts without tensors
|
2363
|
-
if (lm_ggml_get_first_tensor(ctx) == nullptr) {
|
2364
|
-
continue;
|
2365
|
-
}
|
2366
|
-
|
2367
|
-
llama_buf_map bufs;
|
2368
|
-
bufs.reserve(n_max_backend_buffer);
|
2369
11
|
|
2370
|
-
|
2371
|
-
|
2372
|
-
|
2373
|
-
|
2374
|
-
dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
|
2375
|
-
}
|
2376
|
-
lm_ggml_backend_dev_props props;
|
2377
|
-
lm_ggml_backend_dev_get_props(dev, &props);
|
2378
|
-
bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
|
2379
|
-
bool is_default_buft = buft == lm_ggml_backend_dev_buffer_type(dev);
|
2380
|
-
|
2381
|
-
if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
|
2382
|
-
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
2383
|
-
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
2384
|
-
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
2385
|
-
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
2386
|
-
void * addr = nullptr;
|
2387
|
-
size_t first, last; // NOLINT
|
2388
|
-
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
2389
|
-
if (first >= last) {
|
2390
|
-
continue;
|
2391
|
-
}
|
2392
|
-
const size_t max_size = lm_ggml_get_max_tensor_size(ctx);
|
2393
|
-
lm_ggml_backend_buffer_t buf = lm_ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
|
2394
|
-
if (buf == nullptr) {
|
2395
|
-
throw std::runtime_error(format("unable to allocate %s buffer", lm_ggml_backend_buft_name(buft)));
|
2396
|
-
}
|
2397
|
-
model.bufs.emplace_back(buf);
|
2398
|
-
bufs.emplace(idx, buf);
|
2399
|
-
}
|
2400
|
-
}
|
2401
|
-
else {
|
2402
|
-
lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
2403
|
-
if (buf == nullptr) {
|
2404
|
-
throw std::runtime_error(format("unable to allocate %s buffer", lm_ggml_backend_buft_name(buft)));
|
2405
|
-
}
|
2406
|
-
model.bufs.emplace_back(buf);
|
2407
|
-
if (use_mlock && lm_ggml_backend_buffer_is_host(buf)) {
|
2408
|
-
model.mlock_bufs.emplace_back(new llama_mlock);
|
2409
|
-
auto & mlock_buf = model.mlock_bufs.back();
|
2410
|
-
mlock_buf->init (lm_ggml_backend_buffer_get_base(buf));
|
2411
|
-
mlock_buf->grow_to(lm_ggml_backend_buffer_get_size(buf));
|
2412
|
-
}
|
2413
|
-
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
2414
|
-
bufs.emplace(idx, buf);
|
2415
|
-
}
|
2416
|
-
}
|
2417
|
-
|
2418
|
-
if (bufs.empty()) {
|
2419
|
-
throw std::runtime_error("failed to allocate buffer");
|
2420
|
-
}
|
2421
|
-
|
2422
|
-
for (auto & buf : bufs) {
|
2423
|
-
// indicate that this buffer contains weights
|
2424
|
-
// this is used by lm_ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
|
2425
|
-
lm_ggml_backend_buffer_set_usage(buf.second, LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
2426
|
-
}
|
2427
|
-
|
2428
|
-
ctx_bufs.emplace_back(ctx, bufs);
|
2429
|
-
}
|
2430
|
-
|
2431
|
-
if (llama_supports_gpu_offload()) {
|
2432
|
-
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
2433
|
-
|
2434
|
-
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
2435
|
-
if (n_gpu_layers > (int) hparams.n_layer) {
|
2436
|
-
LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
|
2437
|
-
}
|
2438
|
-
|
2439
|
-
const int max_backend_supported_layers = hparams.n_layer + 1;
|
2440
|
-
const int max_offloadable_layers = hparams.n_layer + 1;
|
2441
|
-
|
2442
|
-
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
2443
|
-
}
|
2444
|
-
|
2445
|
-
// print memory requirements per buffer type
|
2446
|
-
for (auto & buf : model.bufs) {
|
2447
|
-
LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf.get()), lm_ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
|
2448
|
-
}
|
2449
|
-
|
2450
|
-
// populate tensors_by_name
|
2451
|
-
for (auto & ctx : model.ctxs) {
|
2452
|
-
for (auto * cur = lm_ggml_get_first_tensor(ctx.get()); cur != NULL; cur = lm_ggml_get_next_tensor(ctx.get(), cur)) {
|
2453
|
-
model.tensors_by_name.emplace_back(lm_ggml_get_name(cur), cur);
|
2454
|
-
}
|
2455
|
-
}
|
+ #include "ggml.h"
+ #include "ggml-alloc.h"
+ #include "ggml-backend.h"
+ #include "ggml-cpp.h"

-
-
-
-
-
-
-
-
+ #include <algorithm>
+ #include <array>
+ #include <cassert>
+ #include <cfloat>
+ #include <cmath>
+ #include <cstddef>
+ #include <cstdint>
+ #include <cstdio>
+ #include <cstring>
+ #include <ctime>
+ #include <functional>

-
-
-
- }
- }
+ #if defined(_MSC_VER)
+ #pragma warning(disable: 4244 4267) // possible loss of data
+ #endif

-
-
+ #if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING)
+ #include <android/log.h>
+ #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
+ #undef LLAMA_LOG_INFO
+ #undef LLAMA_LOG_WARN
+ #undef LLAMA_LOG_ERROR
+ #define LLAMA_LOG_INFO(...)  __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
+ #define LLAMA_LOG_WARN(...)  __android_log_print(ANDROID_LOG_WARN , LLAMA_ANDROID_TAG, __VA_ARGS__)
+ #define LLAMA_LOG_ERROR(...) __android_log_print(ANDROID_LOG_ERROR, LLAMA_ANDROID_TAG, __VA_ARGS__)
+ #endif // __ANDROID__

  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
- static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
-
+ static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+ // loading time will be recalculated after the first eval, so
+ // we take page faults deferred by mmap() into consideration
+ model.t_load_us = 0;
+ time_meas tm(model.t_load_us);
+
+ model.t_start_us = tm.t_start_us;

  try {
- llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
+ llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
+
+ ml.print_info();

  model.hparams.vocab_only = params.vocab_only;

  try {
-
+ model.load_arch(ml);
  } catch(const std::exception & e) {
  throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
  }
  try {
-
+ model.load_hparams(ml);
  } catch(const std::exception & e) {
  throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
  }
  try {
-
+ model.load_vocab(ml);
  } catch(const std::exception & e) {
  throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
  }

-
-
-
- if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
- model.hparams.n_vocab != model.vocab.id_to_token.size()) {
- throw std::runtime_error("vocab size mismatch");
- }
+ model.load_stats(ml);
+ model.print_info();

  if (params.vocab_only) {
  LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
  return 0;
  }

- if (!
- ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
- params.progress_callback, params.progress_callback_user_data
- )) {
+ if (!model.load_tensors(ml)) {
  return -2;
  }
  } catch (const std::exception & err) {
@@ -2521,10 +89,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
  return -1;
  }

- // loading time will be recalculate after the first eval, so
- // we take page faults deferred by mmap() into consideration
- model.t_load_us = lm_ggml_time_us() - model.t_start_us;
-
  return 0;
  }

@@ -2572,16 +136,16 @@ static struct lm_ggml_tensor * llm_build_inp_embd(
  inpL = lm_ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);

  // apply lora for embedding tokens if needed
- for (auto & it : lctx.
- struct
- if (
+ for (auto & it : lctx.lora) {
+ struct llama_adapter_lora_weight * lw = it.first->get_weight(tok_embd);
+ if (lw == nullptr) {
  continue;
  }
  const float adapter_scale = it.second;
- const float scale =
+ const float scale = lw->get_scale(it.first->alpha, adapter_scale);
  struct lm_ggml_tensor * inpL_delta = lm_ggml_scale(ctx, lm_ggml_mul_mat(
- ctx,
- lm_ggml_get_rows(ctx,
+ ctx, lw->b, // non-transposed lora_b
+ lm_ggml_get_rows(ctx, lw->a, lctx.inp_tokens)
  ), scale);
  inpL = lm_ggml_add(ctx, inpL, inpL_delta);
  }
@@ -2652,16 +216,16 @@ static struct lm_ggml_tensor * llm_build_lora_mm(
  struct lm_ggml_tensor * w,
  struct lm_ggml_tensor * cur) {
  struct lm_ggml_tensor * res = lm_ggml_mul_mat(ctx0, w, cur);
- for (auto & it : lctx.
- struct
- if (
+ for (auto & it : lctx.lora) {
+ struct llama_adapter_lora_weight * lw = it.first->get_weight(w);
+ if (lw == nullptr) {
  continue;
  }
  const float adapter_scale = it.second;
- const float scale =
+ const float scale = lw->get_scale(it.first->alpha, adapter_scale);
  struct lm_ggml_tensor * ab_cur = lm_ggml_mul_mat(
- ctx0,
- lm_ggml_mul_mat(ctx0,
+ ctx0, lw->b,
+ lm_ggml_mul_mat(ctx0, lw->a, cur)
  );
  ab_cur = lm_ggml_scale(ctx0, ab_cur, scale);
  res = lm_ggml_add(ctx0, res, ab_cur);
@@ -2677,17 +241,17 @@ static struct lm_ggml_tensor * llm_build_lora_mm_id(
  struct lm_ggml_tensor * cur, // struct lm_ggml_tensor * b
  struct lm_ggml_tensor * ids) {
  struct lm_ggml_tensor * res = lm_ggml_mul_mat_id(ctx0, w, cur, ids);
- for (auto & it : lctx.
- struct
- if (
+ for (auto & it : lctx.lora) {
+ struct llama_adapter_lora_weight * lw = it.first->get_weight(w);
+ if (lw == nullptr) {
  continue;
  }
  const float alpha = it.first->alpha;
- const float rank = (float)
+ const float rank = (float) lw->b->ne[0];
  const float scale = alpha ? it.second * alpha / rank : it.second;
  struct lm_ggml_tensor * ab_cur = lm_ggml_mul_mat_id(
- ctx0,
- lm_ggml_mul_mat_id(ctx0,
+ ctx0, lw->b,
+ lm_ggml_mul_mat_id(ctx0, lw->a, cur, ids),
  ids
  );
  ab_cur = lm_ggml_scale(ctx0, ab_cur, scale);
@@ -3318,16 +882,20 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
  const struct llama_layer * layer,
  struct lm_ggml_tensor * cur,
  struct lm_ggml_tensor * x_prev,
- struct lm_ggml_tensor ** wkv_state
+ struct lm_ggml_tensor ** wkv_state,
+ size_t wkv_head_size,
+ size_t head_count_kv) {
  size_t n_embd = cur->ne[0];
  size_t n_seq_tokens = cur->ne[1];
  size_t n_seqs = cur->ne[2];

- size_t head_size =
- size_t head_count =
+ size_t head_size = wkv_head_size;
+ size_t head_count = n_embd / head_size;

  size_t n_tokens = n_seqs * n_seq_tokens;

+ bool is_qrwkv = layer->time_mix_first == nullptr;
+
  struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);

  sx = lm_ggml_reshape_2d(ctx, sx, n_embd, n_tokens);
@@ -3356,69 +924,64 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
  xxx
  );

- struct lm_ggml_tensor *
-
-
-
-
-
- ctx,
-
-
-
-
-
-
+ struct lm_ggml_tensor *xw, *xk, *xv, *xr, *xg;
+ if (layer->time_mix_lerp_fused) {
+ // fusing these weights makes some performance improvement
+ sx  = lm_ggml_reshape_3d(ctx, sx, n_embd, 1, n_tokens);
+ cur = lm_ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
+ xxx = lm_ggml_add(ctx, lm_ggml_mul(ctx, lm_ggml_add(ctx, xxx, layer->time_mix_lerp_fused), sx), cur);
+ xw = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+ xk = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+ xv = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+ xr = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+ xg = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
+ } else {
+ // for backward compatibility
+ xw = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+ xk = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+ xv = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+ xr = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+ xg = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));

-
- ctx,
- lm_ggml_mul(
-
-
-
- ),
- cur
- );
+ xw = lm_ggml_add(ctx, lm_ggml_mul(ctx, lm_ggml_add(ctx, xw, layer->time_mix_lerp_w), sx), cur);
+ xk = lm_ggml_add(ctx, lm_ggml_mul(ctx, lm_ggml_add(ctx, xk, layer->time_mix_lerp_k), sx), cur);
+ xv = lm_ggml_add(ctx, lm_ggml_mul(ctx, lm_ggml_add(ctx, xv, layer->time_mix_lerp_v), sx), cur);
+ xr = lm_ggml_add(ctx, lm_ggml_mul(ctx, lm_ggml_add(ctx, xr, layer->time_mix_lerp_r), sx), cur);
+ xg = lm_ggml_add(ctx, lm_ggml_mul(ctx, lm_ggml_add(ctx, xg, layer->time_mix_lerp_g), sx), cur);
+ }

- struct lm_ggml_tensor *
-
-
-
-
-
-
-
-
+ struct lm_ggml_tensor * r = llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr);
+ struct lm_ggml_tensor * k = llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk);
+ struct lm_ggml_tensor * v = llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv);
+ if (layer->time_mix_receptance_b) {
+ r = lm_ggml_add(ctx, r, layer->time_mix_receptance_b);
+ }
+ if (layer->time_mix_key_b) {
+ k = lm_ggml_add(ctx, k, layer->time_mix_key_b);
+ }
+ if (layer->time_mix_value_b) {
+ v = lm_ggml_add(ctx, v, layer->time_mix_value_b);
+ }

- struct lm_ggml_tensor *
-
-
-
-
-
- ),
- cur
- );
+ struct lm_ggml_tensor * g = llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg);
+ if (is_qrwkv) {
+ g = lm_ggml_sigmoid(ctx, g);
+ } else {
+ g = lm_ggml_silu(ctx, g);
+ }

-
-
-
-
-
-
- )
-
- );
+ if (head_count_kv != head_count) {
+ LM_GGML_ASSERT(head_count % head_count_kv == 0);
+ k = lm_ggml_reshape_4d(ctx, k, head_size, 1, head_count_kv, n_tokens);
+ v = lm_ggml_reshape_4d(ctx, v, head_size, 1, head_count_kv, n_tokens);
+ struct lm_ggml_tensor * tmp = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens);
+ k = lm_ggml_repeat(ctx, k, tmp);
+ v = lm_ggml_repeat(ctx, v, tmp);
+ }

-
-
-
- struct lm_ggml_tensor * g = lm_ggml_silu(
- ctx,
- llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
- );
+ k = lm_ggml_reshape_3d(ctx, k, head_size, head_count, n_tokens);
+ v = lm_ggml_reshape_3d(ctx, v, head_size, head_count, n_tokens);
+ r = lm_ggml_reshape_3d(ctx, r, head_size, head_count, n_tokens);

  struct lm_ggml_tensor * w = lm_ggml_mul_mat(
  ctx,
@@ -3429,25 +992,35 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
  )
  );

- w = lm_ggml_add(ctx, w,
+ w = lm_ggml_add(ctx, w, layer->time_mix_decay);
  w = lm_ggml_exp(ctx, lm_ggml_neg(ctx, lm_ggml_exp(ctx, w)));
- w =
+ w = lm_ggml_reshape_3d(ctx, w, head_size, head_count, n_tokens);

-
-
-
+ if (is_qrwkv) {
+ // k = k * (1 - w)
+ k = lm_ggml_sub(ctx, k, lm_ggml_mul(ctx, k, w));
+ }

- struct lm_ggml_tensor * wkv_output
+ struct lm_ggml_tensor * wkv_output;
+ if (!layer->time_mix_first) {
+ wkv_output = lm_ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f));
+ } else {
+ wkv_output = lm_ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
+ }
  cur = lm_ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
  *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));

-
-
-
+ if (!is_qrwkv) {
+ // group norm with head_count groups
+ cur = lm_ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
+ cur = lm_ggml_norm(ctx, cur, 64e-5f);

-
-
-
+ // Convert back to regular vectors.
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
+ cur = lm_ggml_add(ctx, lm_ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
+ } else {
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
+ }

  cur = lm_ggml_mul(ctx, cur, g);
  cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
@@ -3603,7 +1176,7 @@ struct llm_build_context {
|
|
3603
1176
|
}
|
3604
1177
|
|
3605
1178
|
struct lm_ggml_cgraph * build_k_shift() {
|
3606
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
1179
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
3607
1180
|
|
3608
1181
|
LM_GGML_ASSERT(kv_self.size == n_ctx);
|
3609
1182
|
|
@@ -3653,7 +1226,7 @@ struct llm_build_context {
|
|
3653
1226
|
}
|
3654
1227
|
|
3655
1228
|
struct lm_ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
|
3656
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
1229
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
3657
1230
|
|
3658
1231
|
for (uint32_t i = 0; i < ids.size(); ++i) {
|
3659
1232
|
const uint32_t id = ids[i];
|
@@ -3912,7 +1485,7 @@ struct llm_build_context {
|
|
3912
1485
|
}
|
3913
1486
|
|
3914
1487
|
struct lm_ggml_cgraph * build_llama() {
|
3915
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
1488
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
3916
1489
|
|
3917
1490
|
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
3918
1491
|
int32_t n_tokens = this->n_tokens;
|
@@ -4078,7 +1651,7 @@ struct llm_build_context {
|
|
4078
1651
|
}
|
4079
1652
|
|
4080
1653
|
struct lm_ggml_cgraph * build_deci() {
|
4081
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
1654
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
4082
1655
|
|
4083
1656
|
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
4084
1657
|
int32_t n_tokens = this->n_tokens;
|
@@ -4239,7 +1812,7 @@ struct llm_build_context {
|
|
4239
1812
|
}
|
4240
1813
|
|
4241
1814
|
struct lm_ggml_cgraph * build_baichuan() {
|
4242
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
1815
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
4243
1816
|
|
4244
1817
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
4245
1818
|
LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
@@ -4251,7 +1824,7 @@ struct llm_build_context {
|
|
4251
1824
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
4252
1825
|
|
4253
1826
|
// inp_pos - contains the positions
|
4254
|
-
struct lm_ggml_tensor * inp_pos = model.type ==
|
1827
|
+
struct lm_ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
|
4255
1828
|
|
4256
1829
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4257
1830
|
struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
@@ -4276,7 +1849,7 @@ struct llm_build_context {
|
|
4276
1849
|
cb(Vcur, "Vcur", il);
|
4277
1850
|
|
4278
1851
|
switch (model.type) {
|
4279
|
-
case
|
1852
|
+
case LLM_TYPE_7B:
|
4280
1853
|
Qcur = lm_ggml_rope_ext(
|
4281
1854
|
ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
4282
1855
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
@@ -4288,7 +1861,7 @@ struct llm_build_context {
|
|
4288
1861
|
ext_factor, attn_factor, beta_fast, beta_slow
|
4289
1862
|
);
|
4290
1863
|
break;
|
4291
|
-
case
|
1864
|
+
case LLM_TYPE_13B:
|
4292
1865
|
Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
|
4293
1866
|
Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
|
4294
1867
|
break;
|
@@ -4354,7 +1927,7 @@ struct llm_build_context {
|
|
4354
1927
|
}
|
4355
1928
|
|
4356
1929
|
struct lm_ggml_cgraph * build_xverse() {
|
4357
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
1930
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
4358
1931
|
|
4359
1932
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
4360
1933
|
LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
@@ -4457,7 +2030,7 @@ struct llm_build_context {
|
|
4457
2030
|
}
|
4458
2031
|
|
4459
2032
|
struct lm_ggml_cgraph * build_falcon() {
|
4460
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
2033
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
4461
2034
|
|
4462
2035
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
4463
2036
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
@@ -4577,7 +2150,7 @@ struct llm_build_context {
|
|
4577
2150
|
}
|
4578
2151
|
|
4579
2152
|
struct lm_ggml_cgraph * build_grok() {
|
4580
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
2153
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
4581
2154
|
|
4582
2155
|
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
4583
2156
|
int32_t n_tokens = this->n_tokens;
|
@@ -4736,7 +2309,7 @@ struct llm_build_context {
|
|
4736
2309
|
}
|
4737
2310
|
|
4738
2311
|
struct lm_ggml_cgraph * build_dbrx() {
|
4739
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
2312
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
4740
2313
|
|
4741
2314
|
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
4742
2315
|
int32_t n_tokens = this->n_tokens;
|
@@ -4864,7 +2437,7 @@ struct llm_build_context {
|
|
4864
2437
|
}
|
4865
2438
|
|
4866
2439
|
struct lm_ggml_cgraph * build_starcoder() {
|
4867
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
2440
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
4868
2441
|
|
4869
2442
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
4870
2443
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
@@ -4968,7 +2541,7 @@ struct llm_build_context {
|
|
4968
2541
|
}
|
4969
2542
|
|
4970
2543
|
struct lm_ggml_cgraph * build_refact() {
|
4971
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
2544
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
4972
2545
|
|
4973
2546
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
4974
2547
|
LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
@@ -5062,7 +2635,7 @@ struct llm_build_context {
|
|
5062
2635
|
}
|
5063
2636
|
|
5064
2637
|
struct lm_ggml_cgraph * build_bert() {
|
5065
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
2638
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
5066
2639
|
|
5067
2640
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5068
2641
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
@@ -5256,7 +2829,7 @@ struct llm_build_context {
|
|
5256
2829
|
}
|
5257
2830
|
|
5258
2831
|
struct lm_ggml_cgraph * build_bloom() {
|
5259
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
2832
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
5260
2833
|
|
5261
2834
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5262
2835
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
@@ -5357,7 +2930,7 @@ struct llm_build_context {
|
|
5357
2930
|
}
|
5358
2931
|
|
5359
2932
|
struct lm_ggml_cgraph * build_mpt() {
|
5360
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
2933
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
5361
2934
|
|
5362
2935
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5363
2936
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
@@ -5647,7 +3220,7 @@ struct llm_build_context {
|
|
5647
3220
|
}
|
5648
3221
|
|
5649
3222
|
struct lm_ggml_cgraph * build_qwen() {
|
5650
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
3223
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
5651
3224
|
|
5652
3225
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5653
3226
|
LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
@@ -5759,7 +3332,7 @@ struct llm_build_context {
|
|
5759
3332
|
}
|
5760
3333
|
|
5761
3334
|
struct lm_ggml_cgraph * build_qwen2() {
|
5762
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
3335
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
5763
3336
|
|
5764
3337
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5765
3338
|
LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
@@ -5871,7 +3444,7 @@ struct llm_build_context {
|
|
5871
3444
|
}
|
5872
3445
|
|
5873
3446
|
struct lm_ggml_cgraph * build_qwen2vl() {
|
5874
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
3447
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
5875
3448
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5876
3449
|
LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
5877
3450
|
LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
|
@@ -5989,7 +3562,7 @@ struct llm_build_context {
|
|
5989
3562
|
}
|
5990
3563
|
|
5991
3564
|
struct lm_ggml_cgraph * build_qwen2moe() {
|
5992
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
3565
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
5993
3566
|
|
5994
3567
|
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
5995
3568
|
int32_t n_tokens = this->n_tokens;
|
@@ -6137,7 +3710,7 @@ struct llm_build_context {
|
|
6137
3710
|
}
|
6138
3711
|
|
6139
3712
|
struct lm_ggml_cgraph * build_phi2() {
|
6140
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
3713
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
6141
3714
|
|
6142
3715
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6143
3716
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
@@ -6258,7 +3831,7 @@ struct llm_build_context {
|
|
6258
3831
|
}
|
6259
3832
|
|
6260
3833
|
struct lm_ggml_cgraph * build_phi3() {
|
6261
|
-
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
|
3834
|
+
struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
6262
3835
|
|
6263
3836
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6264
3837
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
@@ -6291,7 +3864,7 @@ struct llm_build_context {
|
|
6291
3864
|
|
6292
3865
|
struct lm_ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
6293
3866
|
model.layers[il].attn_norm,
|
6294
|
-
|
3867
|
+
model.layers[il].attn_norm_b,
|
6295
3868
|
LLM_NORM_RMS, cb, il);
|
6296
3869
|
cb(attn_norm_output, "attn_norm", il);
|
6297
3870
|
|
@@ -6306,8 +3879,7 @@ struct llm_build_context {
 Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
 Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
 Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
-}
-else {
+} else {
 Qcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
 Kcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
 Vcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
@@ -6351,14 +3923,12 @@ struct llm_build_context {
 residual = cur;

 cur = llm_build_norm(ctx0, cur, hparams,
-model.layers[il].ffn_norm,
+model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
 LLM_NORM_RMS, cb, il);
 cb(cur, "ffn_norm", il);

-//
-
-// TOOD: support into llm_build_ffn
-{
+// feed-forward network
+if (model.layers[il].ffn_gate_inp == nullptr) {
 cur = llm_build_ffn(ctx0, lctx, cur,
 model.layers[il].ffn_up, NULL, NULL,
 NULL, NULL, NULL,
@@ -6366,6 +3936,20 @@ struct llm_build_context {
 NULL,
 LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
 cb(cur, "ffn_out", il);
+} else {
+// MoE branch
+cur = llm_build_moe_ffn(ctx0, lctx, cur,
+model.layers[il].ffn_gate_inp,
+model.layers[il].ffn_up_exps,
+model.layers[il].ffn_gate_exps,
+model.layers[il].ffn_down_exps,
+nullptr,
+n_expert, n_expert_used,
+LLM_FFN_SILU, true,
+false, 0.0,
+LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+cb, il);
+cb(cur, "ffn_moe_out", il);
 }

 cur = lm_ggml_add(ctx0, residual, cur);
@@ -6378,11 +3962,16 @@ struct llm_build_context {

 cur = llm_build_norm(ctx0, inpL, hparams,
 model.output_norm,
-
+model.output_norm_b,
 LLM_NORM_RMS, cb, -1);
 cb(cur, "result_norm", -1);

 cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+if (model.output_b != nullptr) {
+cb(cur, "result_output_no_bias", -1);
+cur = lm_ggml_add(ctx0, cur, model.output_b);
+}
 cb(cur, "result_output", -1);

 lm_ggml_build_forward_expand(gf, cur);
@@ -6496,7 +4085,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_gpt2() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -6601,7 +4190,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_codeshell() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -6712,7 +4301,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_orion() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -6830,7 +4419,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_internlm2() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -6948,7 +4537,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_minicpm3() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 //TODO: if the model varies, these parameters need to be read from the model
 const int64_t n_embd_base = 256;
@@ -7064,7 +4653,7 @@ struct llm_build_context {
 0);
 cb(v_states, "v_states", il);

-q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend
+q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
 q_pe = lm_ggml_rope_ext(
 ctx0, q_pe, inp_pos, rope_factors,
 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -7073,7 +4662,7 @@ struct llm_build_context {
 cb(q_pe, "q_pe", il);

 // shared RoPE key
-k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend
+k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
 k_pe = lm_ggml_rope_ext(
 ctx0, k_pe, inp_pos, rope_factors,
 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -7157,7 +4746,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_gemma() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head_k = hparams.n_embd_head_k;

@@ -7265,7 +4854,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_gemma2() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head_k = hparams.n_embd_head_k;

@@ -7315,9 +4904,9 @@ struct llm_build_context {

 // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
 switch (model.type) {
-case
-case
-case
+case LLM_TYPE_2B:
+case LLM_TYPE_9B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
+case LLM_TYPE_27B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
 default: LM_GGML_ABORT("fatal error");
 };
 cb(Qcur, "Qcur_scaled", il);
@@ -7401,7 +4990,7 @@ struct llm_build_context {


 struct lm_ggml_cgraph * build_starcoder2() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7520,7 +5109,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_mamba() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 struct lm_ggml_tensor * cur;
 struct lm_ggml_tensor * inpL;
@@ -7575,7 +5164,7 @@ struct llm_build_context {

 struct lm_ggml_cgraph * build_command_r() {

-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7723,7 +5312,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_cohere2() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7860,7 +5449,7 @@ struct llm_build_context {
 // * removed bias
 // * removed MoE
 struct lm_ggml_cgraph * build_olmo() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 // mutable variable, needed during the last layer of the computation to skip unused tokens
 int32_t n_tokens = this->n_tokens;
@@ -7984,7 +5573,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_olmo2() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 // mutable variable, needed during the last layer of the computation to skip unused tokens
 int32_t n_tokens = this->n_tokens;
@@ -8112,7 +5701,7 @@ struct llm_build_context {
 // * removed bias
 // * added q, k norm
 struct lm_ggml_cgraph * build_olmoe() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 // mutable variable, needed during the last layer of the computation to skip unused tokens
 int32_t n_tokens = this->n_tokens;
@@ -8238,7 +5827,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_openelm() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8363,7 +5952,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_gptneox() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -8505,7 +6094,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_arctic() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 // mutable variable, needed during the last layer of the computation to skip unused tokens
 int32_t n_tokens = this->n_tokens;
@@ -8639,7 +6228,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_deepseek() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 // mutable variable, needed during the last layer of the computation to skip unused tokens
 int32_t n_tokens = this->n_tokens;
@@ -8796,7 +6385,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_deepseek2() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 // mutable variable, needed during the last layer of the computation to skip unused tokens
 int32_t n_tokens = this->n_tokens;
@@ -8918,7 +6507,7 @@ struct llm_build_context {
 0);
 cb(v_states, "v_states", il);

-q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend
+q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
 q_pe = lm_ggml_rope_ext(
 ctx0, q_pe, inp_pos, nullptr,
 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -8927,7 +6516,7 @@ struct llm_build_context {
 cb(q_pe, "q_pe", il);

 // shared RoPE key
-k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend
+k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
 k_pe = lm_ggml_rope_ext(
 ctx0, k_pe, inp_pos, nullptr,
 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9026,7 +6615,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_bitnet() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9177,7 +6766,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_t5_enc() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 // mutable variable, needed during the last layer of the computation to skip unused tokens
 int32_t n_tokens = this->n_tokens;
@@ -9309,7 +6898,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_t5_dec() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 // mutable variable, needed during the last layer of the computation to skip unused tokens
 int32_t n_tokens = this->n_tokens;
@@ -9514,7 +7103,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_jais() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9606,7 +7195,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_chatglm() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9720,7 +7309,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_nemotron() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
 LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9841,7 +7430,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_exaone() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 // mutable variable, needed during the last layer of the computation to skip unused tokens
 int32_t n_tokens = this->n_tokens;
@@ -9968,7 +7557,7 @@ struct llm_build_context {
 }

 lm_ggml_cgraph * build_rwkv6() {
-lm_ggml_cgraph *gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 // Token shift state dimensions should be 2 * n_emb
 LM_GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
@@ -10013,7 +7602,7 @@ struct llm_build_context {
 1
 );

-cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
+cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size));
 lm_ggml_build_forward_expand(gf, cur);
 lm_ggml_build_forward_expand(
 gf,
@@ -10080,6 +7669,118 @@ struct llm_build_context {
 return gf;
 }

+// ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
+lm_ggml_cgraph * build_rwkv6qwen2() {
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+LM_GGML_ASSERT(n_embd == hparams.n_embd_k_s());
+
+const int64_t n_seqs = ubatch.n_seqs;
+const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+const int64_t n_tokens = ubatch.n_tokens;
+LM_GGML_ASSERT(n_seqs != 0);
+LM_GGML_ASSERT(ubatch.equal_seqs);
+LM_GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
+
+struct lm_ggml_tensor * cur;
+struct lm_ggml_tensor * inpL;
+struct lm_ggml_tensor * state_copy = build_inp_s_copy();
+struct lm_ggml_tensor * state_mask = build_inp_s_mask();
+
+inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+for (int il = 0; il < n_layer; ++il) {
+const llama_layer * layer = &model.layers[il];
+
+// (ab)using the KV cache to store the states
+struct lm_ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
+gf, kv_self.k_l[il], state_copy, state_mask,
+hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
+struct lm_ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
+gf, kv_self.v_l[il], state_copy, state_mask,
+hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
+
+cur = lm_ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+token_shift = lm_ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs);
+
+struct lm_ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il);
+struct lm_ggml_tensor * x_prev = lm_ggml_concat(
+ctx0,
+token_shift,
+lm_ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
+1
+);
+
+lm_ggml_build_forward_expand(
+gf,
+lm_ggml_cpy(
+ctx0,
+wkv_states,
+lm_ggml_view_1d(
+ctx0,
+kv_self.v_l[il],
+hparams.n_embd_v_s() * n_seqs,
+hparams.n_embd_v_s() * kv_head * lm_ggml_element_size(kv_self.v_l[il])
+)
+)
+);
+
+struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv()));
+lm_ggml_build_forward_expand(gf, ffn_inp);
+lm_ggml_build_forward_expand(
+gf,
+lm_ggml_cpy(
+ctx0,
+wkv_states,
+lm_ggml_view_1d(
+ctx0,
+kv_self.v_l[il],
+hparams.n_embd_v_s() * n_seqs,
+hparams.n_embd_v_s() * kv_head * lm_ggml_element_size(kv_self.v_l[il])
+)
+)
+);
+
+cb(ffn_inp, "ffn_inp", il);
+
+// feed-forward network
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].ffn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "ffn_norm", il);
+
+cur = llm_build_ffn(ctx0, lctx, cur,
+model.layers[il].ffn_up, NULL, NULL,
+model.layers[il].ffn_gate, NULL, NULL,
+model.layers[il].ffn_down, NULL, NULL,
+NULL,
+LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+cb(cur, "ffn_out", il);
+
+cur = lm_ggml_add(ctx0, cur, ffn_inp);
+cur = lctx.cvec.apply_to(ctx0, cur, il);
+cb(cur, "l_out", il);
+
+// input for next layer
+inpL = cur;
+}
+
+cur = inpL;
+struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = lm_ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+
+cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1);
+cb(cur, "result_norm", -1);
+
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+cb(cur, "result_output", -1);
+
+lm_ggml_build_forward_expand(gf, cur);
+
+return gf;
+}
+
 // ref: https://github.com/facebookresearch/chameleon
 // based on the original build_llama() function, changes:
 // * qk-norm
@@ -10087,7 +7788,7 @@ struct llm_build_context {
 // * removed bias
 // * removed MoE
 struct lm_ggml_cgraph * build_chameleon() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 // mutable variable, needed during the last layer of the computation to skip unused tokens
 int32_t n_tokens = this->n_tokens;
@@ -10259,7 +7960,7 @@ struct llm_build_context {
 }

 struct lm_ggml_cgraph * build_wavtokenizer_dec() {
-struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0,
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, model.max_nodes(), false);

 struct lm_ggml_tensor * cur;
 struct lm_ggml_tensor * inpL;
@@ -10468,12 +8169,12 @@ static struct lm_ggml_cgraph * llama_build_graph(

 // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
 // FIXME: fix in lm_ggml_backend_sched
-const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+const bool full_offload = lctx.model.params.n_gpu_layers > (int) lctx.model.hparams.n_layer;
 if (ubatch.n_tokens < 32 || full_offload) {
 if (il != -1 && strcmp(name, "norm") == 0) {
-const auto & dev_layer = lctx.model.dev_layer
+const auto & dev_layer = lctx.model.dev_layer(il);
 for (auto & backend : lctx.backends) {
-if (lm_ggml_backend_get_device(backend.get()) == dev_layer
+if (lm_ggml_backend_get_device(backend.get()) == dev_layer) {
 if (lm_ggml_backend_supports_op(backend.get(), cur)) {
 lm_ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get());
 }
@@ -10561,6 +8262,7 @@ static struct lm_ggml_cgraph * llama_build_graph(
 result = llm.build_phi2();
 } break;
 case LLM_ARCH_PHI3:
+case LLM_ARCH_PHIMOE:
 {
 result = llm.build_phi3();
 } break;
@@ -10688,6 +8390,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
 {
 result = llm.build_rwkv6();
 } break;
+case LLM_ARCH_RWKV6QWEN2:
+{
+result = llm.build_rwkv6qwen2();
+} break;
 case LLM_ARCH_CHAMELEON:
 {
 result = llm.build_chameleon();
@@ -10767,6 +8473,7 @@ static int llama_decode_impl(
 const uint32_t n_tokens_all = batch.n_tokens;

 const auto & model = lctx.model;
+const auto & vocab = model.vocab;
 const auto & hparams = model.hparams;
 const auto & cparams = lctx.cparams;

@@ -10774,7 +8481,7 @@ static int llama_decode_impl(

 if (batch.token) {
 for (uint32_t i = 0; i < n_tokens_all; ++i) {
-if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.
+if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
 LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
 return -1;
 }
@@ -10794,7 +8501,7 @@ static int llama_decode_impl(
 llama_kv_slot_restorer kv_slot_restorer(kv_self);

 const int64_t n_embd = hparams.n_embd;
-const int64_t n_vocab =
+const int64_t n_vocab = vocab.n_tokens();

 uint32_t n_outputs = 0;
 uint32_t n_outputs_prev = 0;
@@ -11109,7 +8816,7 @@ static int llama_encode_impl(

 if (batch.token) {
 for (uint32_t i = 0; i < n_tokens; ++i) {
-if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.
+if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
 LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
 return -1;
 }
@@ -11286,9 +8993,9 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
 // each move requires 6*n_layer tensors (see build_defrag)
 // - source view, destination view, copy operation
 // - x2 for keys and values
-//const uint32_t max_moves =
+//const uint32_t max_moves = model.max_nodes()/(6*n_layer);
 // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
-const uint32_t max_moves = (
+const uint32_t max_moves = (lctx.model.max_nodes() - 2*n_layer)/(6*n_layer);

 // determine which KV cells to move where
 //
@@ -11535,7 +9242,7 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) {
 // build worst-case graph
 uint32_t n_seqs = 1; // TODO: worst-case number of sequences
 uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
-llama_token token =
+llama_token token = lctx.model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
 llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
 lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);

@@ -11547,39 +9254,38 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) {
 }
 }

-int32_t
+int32_t llama_set_adapter_lora(
 struct llama_context * ctx,
-struct
+struct llama_adapter_lora * adapter,
 float scale) {
-ctx->
+ctx->lora[adapter] = scale;
 return 0;
 }

-int32_t
+int32_t llama_rm_adapter_lora(
 struct llama_context * ctx,
-struct
-auto pos = ctx->
-if (pos != ctx->
-ctx->
+struct llama_adapter_lora * adapter) {
+auto pos = ctx->lora.find(adapter);
+if (pos != ctx->lora.end()) {
+ctx->lora.erase(pos);
 return 0;
 }

 return -1;
 }

-void
-ctx->
+void llama_clear_adapter_lora(struct llama_context * ctx) {
+ctx->lora.clear();
 }

-
-
-struct llama_context * lctx,
+int32_t llama_apply_adapter_cvec(
+struct llama_context * ctx,
 const float * data,
 size_t len,
 int32_t n_embd,
 int32_t il_start,
 int32_t il_end) {
-return
+return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end);
 }

 //
@@ -11679,18 +9385,13 @@ int64_t llama_time_us(void) {
 return lm_ggml_time_us();
 }

-struct llama_model *
-const
-
-return llama_model_load_from_file(path_model, params);
-}
-
-struct llama_model * llama_model_load_from_file(
-const char * path_model,
+static struct llama_model * llama_model_load_from_file_impl(
+const std::string & path_model,
+std::vector<std::string> & splits,
 struct llama_model_params params) {
 lm_ggml_time_init();

-llama_model * model = new llama_model;
+llama_model * model = new llama_model(params);

 unsigned cur_percentage = 0;
 if (params.progress_callback == NULL) {
@@ -11709,47 +9410,6 @@ struct llama_model * llama_model_load_from_file(
 };
 }

-if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
-// split the servers set them into model->rpc_servers
-std::string servers(params.rpc_servers);
-size_t pos = 0;
-while ((pos = servers.find(',')) != std::string::npos) {
-std::string server = servers.substr(0, pos);
-model->rpc_servers.push_back(server);
-servers.erase(0, pos + 1);
-}
-model->rpc_servers.push_back(servers);
-}
-
-// add RPC devices
-if (!model->rpc_servers.empty()) {
-lm_ggml_backend_reg_t rpc_reg = lm_ggml_backend_reg_by_name("RPC");
-if (!rpc_reg) {
-LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
-llama_model_free(model);
-return nullptr;
-}
-
-typedef lm_ggml_backend_dev_t (*lm_ggml_backend_rpc_add_device_t)(const char * endpoint);
-lm_ggml_backend_rpc_add_device_t lm_ggml_backend_rpc_add_device_fn = (lm_ggml_backend_rpc_add_device_t) lm_ggml_backend_reg_get_proc_address(rpc_reg, "lm_ggml_backend_rpc_add_device");
-if (!lm_ggml_backend_rpc_add_device_fn) {
-LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
-llama_model_free(model);
-return nullptr;
-}
-
-for (const std::string & server : model->rpc_servers) {
-lm_ggml_backend_dev_t dev = lm_ggml_backend_rpc_add_device_fn(server.c_str());
-if (dev) {
-model->devices.push_back(dev);
-} else {
-LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
-llama_model_free(model);
-return nullptr;
-}
-}
-}
-
 // create list of devices to use with this model
 if (params.devices) {
 for (lm_ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
@@ -11790,7 +9450,7 @@ struct llama_model * llama_model_load_from_file(
 LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, lm_ggml_backend_dev_name(dev), lm_ggml_backend_dev_description(dev), free/1024/1024);
 }

-int status = llama_model_load(path_model, *model, params);
+const int status = llama_model_load(path_model, splits, *model, params);
 LM_GGML_ASSERT(status <= 0);
 if (status < 0) {
 if (status == -1) {
@@ -11806,7 +9466,36 @@ struct llama_model * llama_model_load_from_file(
 return model;
 }

-
+// deprecated
+struct llama_model * llama_load_model_from_file(
+const char * path_model,
+struct llama_model_params params) {
+return llama_model_load_from_file(path_model, params);
+}
+
+struct llama_model * llama_model_load_from_file(
+const char * path_model,
+struct llama_model_params params) {
+std::vector<std::string> splits = {};
+return llama_model_load_from_file_impl(path_model, splits, params);
+}
+
+struct llama_model * llama_model_load_from_splits(
+const char ** paths,
+size_t n_paths,
+struct llama_model_params params) {
+std::vector<std::string> splits;
+if (n_paths == 0) {
+LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
+return nullptr;
+}
+for (size_t i = 0; i < n_paths; ++i) {
+splits.push_back(paths[i]);
+}
+return llama_model_load_from_file_impl(splits.front(), splits, params);
+}
+
+struct llama_context * llama_init_from_model(
 struct llama_model * model,
 struct llama_context_params params) {

@@ -12064,7 +9753,7 @@ struct llama_context * llama_new_context_with_model(
 backend_ptrs.push_back(backend.get());
 }

-const size_t max_nodes =
+const size_t max_nodes = model->max_nodes();

 // buffer used to store the computation graph and the tensor meta data
 ctx->buf_compute_meta.resize(lm_ggml_tensor_overhead()*max_nodes + lm_ggml_graph_overhead_custom(max_nodes, false));
@@ -12072,9 +9761,9 @@ struct llama_context * llama_new_context_with_model(
 // TODO: move these checks to lm_ggml_backend_sched
 // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
 bool pipeline_parallel =
-
-model->n_gpu_layers > (int)model->hparams.n_layer &&
-model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+model->n_devices() > 1 &&
+model->params.n_gpu_layers > (int)model->hparams.n_layer &&
+model->params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
 params.offload_kqv;

 // pipeline parallelism requires support for async compute and events in all devices
@@ -12105,7 +9794,7 @@ struct llama_context * llama_new_context_with_model(
 // initialize scheduler with the worst-case graph
 uint32_t n_seqs = 1; // TODO: worst-case number of sequences
 uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-llama_token token =
+llama_token token = ctx->model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph

 llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
 lm_ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
@@ -12157,6 +9846,12 @@ struct llama_context * llama_new_context_with_model(
 return ctx;
 }

+struct llama_context * llama_new_context_with_model(
+struct llama_model * model,
+struct llama_context_params params) {
+return llama_init_from_model(model, params);
+}
+
 //
 // kv cache
 //
@@ -12254,166 +9949,18 @@ int32_t llama_decode(
 return ret;
 }

-//
-// vocab
-//
-
-// TODO: tmp bridges below until `struct llama_vocab` is exposed through the public API
-
-const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
-return llama_token_get_text_impl(model->vocab, token);
-}
-
-float llama_token_get_score(const struct llama_model * model, llama_token token) {
-return llama_token_get_score_impl(model->vocab, token);
-}
-
-enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
-return llama_token_get_attr_impl(model->vocab, token);
-}
-
-bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
-return llama_token_is_eog_impl(model->vocab, token);
-}
-
-bool llama_token_is_control(const struct llama_model * model, llama_token token) {
-return llama_token_is_control_impl(model->vocab, token);
-}
-
-llama_token llama_token_bos(const struct llama_model * model) {
-return llama_token_bos_impl(model->vocab);
-}
-
-llama_token llama_token_eos(const struct llama_model * model) {
-return llama_token_eos_impl(model->vocab);
-}
-
-llama_token llama_token_eot(const struct llama_model * model) {
-return llama_token_eot_impl(model->vocab);
-}
-
-llama_token llama_token_cls(const struct llama_model * model) {
-return llama_token_cls_impl(model->vocab);
-}
-
-llama_token llama_token_sep(const struct llama_model * model) {
-return llama_token_sep_impl(model->vocab);
-}
-
-llama_token llama_token_nl (const struct llama_model * model) {
-return llama_token_nl_impl(model->vocab);
-}
-
-llama_token llama_token_pad(const struct llama_model * model) {
-return llama_token_pad_impl(model->vocab);
-}
-
-bool llama_add_bos_token(const struct llama_model * model) {
-return llama_add_bos_token_impl(model->vocab);
-}
-
-bool llama_add_eos_token(const struct llama_model * model) {
-return llama_add_eos_token_impl(model->vocab);
-}
-
-llama_token llama_token_prefix(const struct llama_model * model) {
-return llama_token_prefix_impl(model->vocab);
-}
-
-llama_token llama_token_middle(const struct llama_model * model) {
-return llama_token_middle_impl(model->vocab);
-}
-
-llama_token llama_token_suffix(const struct llama_model * model) {
-return llama_token_suffix_impl(model->vocab);
-}
-
-llama_token llama_token_fim_pre(const struct llama_model * model) {
-return llama_token_fim_pre_impl(model->vocab);
-}
-
-llama_token llama_token_fim_suf(const struct llama_model * model) {
-return llama_token_fim_suf_impl(model->vocab);
-}
-
-llama_token llama_token_fim_mid(const struct llama_model * model) {
-return llama_token_fim_mid_impl(model->vocab);
-}
-
-llama_token llama_token_fim_pad(const struct llama_model * model) {
-return llama_token_fim_pad_impl(model->vocab);
-}
-
-llama_token llama_token_fim_rep(const struct llama_model * model) {
-return llama_token_fim_rep_impl(model->vocab);
-}
-
-llama_token llama_token_fim_sep(const struct llama_model * model) {
-return llama_token_fim_sep_impl(model->vocab);
-}
-
-//
-// tokenization
-//
-
-int32_t llama_tokenize(
-const struct llama_model * model,
-const char * text,
-int32_t text_len,
-llama_token * tokens,
-int32_t n_tokens_max,
-bool add_special,
-bool parse_special) {
-return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
-}
-
-int32_t llama_token_to_piece(
-const struct llama_model * model,
-llama_token token,
-char * buf,
-int32_t length,
-int32_t lstrip,
-bool special) {
-return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
-}
-
-int32_t llama_detokenize(
-const struct llama_model * model,
-const llama_token * tokens,
-int32_t n_tokens,
-char * text,
-int32_t text_len_max,
-bool remove_special,
-bool unparse_special) {
-return llama_detokenize_impl(model->vocab, tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
-}
-
 //
 // chat templates
 //

 int32_t llama_chat_apply_template(
-const struct llama_model * model,
 const char * tmpl,
 const struct llama_chat_message * chat,
 size_t n_msg,
 bool add_ass,
 char * buf,
 int32_t length) {
-std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
-if (tmpl == nullptr) {
-LM_GGML_ASSERT(model != nullptr);
-
-// load template from model, if available
-const auto & it = model->lm_gguf_kv.find("tokenizer.chat_template");
-if (it != model->lm_gguf_kv.end() && it->second.size() > 0) {
-curr_tmpl = it->second;
-}
-else {
-// worst case: there is no information about template, we will use chatml by default
-curr_tmpl = "chatml"; // see llm_chat_apply_template
-}
-}
+const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);

 // format the chat to string
 std::vector<const llama_chat_message *> chat_vec;
@@ -12437,23 +9984,6 @@ int32_t llama_chat_apply_template(
 return res;
 }

-//
-// sampling
-//
-
-// TODO: remove indirection when vocab becomes accesible in llama-sampling.cpp
-struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
-return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
-}
-
-struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
-return llama_sampler_init_infill_impl(model->vocab);
-}
-
-struct llama_sampler * llama_sampler_init_dry(const struct llama_model * model, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
-return llama_sampler_init_dry_impl(model->vocab, llama_n_ctx_train(model), dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers, num_breakers);
-}
-
 //
 // model split
 //