gpt_neox_client 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,814 @@
#include "ggml/ggml.h"

#include "common.h"
#include "common-ggml.h"

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <map>
#include <string>
#include <vector>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

// default hparams (StableLM 3B)
struct gpt_neox_hparams {
    int32_t n_vocab = 50257;
    int32_t n_ctx = 4096;
    int32_t n_embd = 4096;
    int32_t n_head = 32;
    int32_t n_layer = 16;
    int32_t n_rot = 32; // rotary_pct * (n_embd / n_head)
    int32_t par_res = 1; // 1 = true, 0 = false
    int32_t ftype = 1;
    float eps = 1e-5;
};
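
// Note on the defaults above (illustrative only; gpt_neox_model_load() below
// overwrites every field from the model file): with n_embd = 4096 and
// n_head = 32 the per-head dimension is 4096/32 = 128, so n_rot = 32
// corresponds to rotary_pct = 32/128 = 0.25, i.e. rotary position embeddings
// are applied to the first quarter of each head.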

struct gpt_neox_layer {
    // pre normalization
    struct ggml_tensor * ln_1_g;
    struct ggml_tensor * ln_1_b;

    // attention
    struct ggml_tensor * c_attn_attn_w;
    struct ggml_tensor * c_attn_attn_b;

    struct ggml_tensor * c_attn_proj_w;
    struct ggml_tensor * c_attn_proj_b;

    // post normalization
    struct ggml_tensor * ln_2_g;
    struct ggml_tensor * ln_2_b;

    // ff
    struct ggml_tensor * c_mlp_fc_w;
    struct ggml_tensor * c_mlp_fc_b;

    struct ggml_tensor * c_mlp_proj_w;
    struct ggml_tensor * c_mlp_proj_b;
};

struct gpt_neox_model {
    gpt_neox_hparams hparams;

    // normalization
    struct ggml_tensor * ln_f_g;
    struct ggml_tensor * ln_f_b;

    struct ggml_tensor * wte; // token embedding

    struct ggml_tensor * lmh_g; // language model head
    //struct ggml_tensor * lmh_b; // language model bias

    std::vector<gpt_neox_layer> layers;

    // key + value memory
    struct ggml_tensor * memory_k;
    struct ggml_tensor * memory_v;

    //
    struct ggml_context * ctx;
    std::map<std::string, struct ggml_tensor *> tensors;
};

// load the model's weights from a file
bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab) {
    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

    auto fin = std::ifstream(fname, std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
        return false;
    }

    // verify magic
    {
        uint32_t magic;
        fin.read((char *) &magic, sizeof(magic));
        if (magic != GGML_FILE_MAGIC) {
            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
            return false;
        }
    }

    // load hparams
    {
        auto & hparams = model.hparams;

        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
        fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
        fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
        fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
        fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;

        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
        printf("%s: n_head = %d\n", __func__, hparams.n_head);
        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
        printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
        printf("%s: par_res = %d\n", __func__, hparams.par_res);
        printf("%s: ftype = %d\n", __func__, hparams.ftype);
        printf("%s: qntvr = %d\n", __func__, qntvr);

        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
    }

    // load vocab
    {
        const int32_t n_vocab = model.hparams.n_vocab;

        std::string word;
        std::vector<char> buf(128);

        for (int i = 0; i < n_vocab; i++) {
            uint32_t len;
            fin.read((char *) &len, sizeof(len));

            buf.resize(len);
            fin.read((char *) buf.data(), len);
            word.assign(buf.data(), len);

            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
        }
    }

    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
    // in order to save memory and also to speed up the computation
    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
    if (wtype == GGML_TYPE_COUNT) {
        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
                __func__, fname.c_str(), model.hparams.ftype);
        return false;
    }

    auto & ctx = model.ctx;

    size_t ctx_size = 0;

    {
        const auto & hparams = model.hparams;

        const size_t n_embd = hparams.n_embd;
        const size_t n_layer = hparams.n_layer;
        const size_t n_ctx = hparams.n_ctx;
        const size_t n_vocab = hparams.n_vocab;

        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

        ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte

        ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g
        //ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b

        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b

        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
        ctx_size += n_layer*(3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b

        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b

        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
        ctx_size += n_layer*(4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b

        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b

        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

        ctx_size += (6 + 16*n_layer)*1024; // object overhead

        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
    }

    // create the ggml context
    {
        struct ggml_init_params params = {
            /*.mem_size =*/ ctx_size,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc =*/ false,
        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
            return false;
        }
    }

    // prepare memory for the weights
    {
        const auto & hparams = model.hparams;

        const int n_embd = hparams.n_embd;
        const int n_layer = hparams.n_layer;
        const int n_vocab = hparams.n_vocab;

        model.layers.resize(n_layer);

        model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);

        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
        //model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);

        // map by name
        model.tensors["gpt_neox.embed_in.weight"] = model.wte;

        model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g;
        model.tensors["gpt_neox.final_layer_norm.bias"] = model.ln_f_b;

        model.tensors["embed_out.weight"] = model.lmh_g;
        //model.tensors["lm_head.bias"] = model.lmh_b;

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = model.layers[i];

            layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
            layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);

            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

            layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
            layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

            layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
            layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);

            layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
            layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

            // map by name
            model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g;
            model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"] = layer.ln_1_b;

            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w;
            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"] = layer.c_attn_attn_b;

            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w;
            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"] = layer.c_attn_proj_b;

            model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g;
            model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"] = layer.ln_2_b;

            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w;
            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"] = layer.c_mlp_fc_b;

            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w;
            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.bias"] = layer.c_mlp_proj_b;
        }
    }

    // key + value memory
    {
        const auto & hparams = model.hparams;

        const int n_embd = hparams.n_embd;
        const int n_layer = hparams.n_layer;
        const int n_ctx = hparams.n_ctx;

        const int64_t n_mem = n_layer*n_ctx;
        const int64_t n_elements = n_embd*n_mem;

        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);

        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

        printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
    }
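
    // Rough size check for the cache above, using the default hparams declared
    // at the top of this file (a loaded model overrides them): each tensor holds
    // n_layer*n_ctx*n_embd = 16*4096*4096 = 268,435,456 F16 elements, i.e.
    // 512 MiB for memory_k plus 512 MiB for memory_v, about 1 GiB in total.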

    // load weights
    {
        int n_tensors = 0;
        size_t total_size = 0;

        printf("%s: ", __func__);

        while (true) {
            int32_t n_dims;
            int32_t length;
            int32_t ttype;

            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
            fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));

            if (fin.eof()) {
                break;
            }

            int32_t nelements = 1;
            int32_t ne[2] = { 1, 1 };
            for (int i = 0; i < n_dims; ++i) {
                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
                nelements *= ne[i];
            }

            std::string name(length, 0);
            fin.read(&name[0], length);

            if (model.tensors.find(name) == model.tensors.end()) {
                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
                return false;
            }

            auto tensor = model.tensors[name];
            if (ggml_nelements(tensor) != nelements) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
                return false;
            }

            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
                return false;
            }

            // for debugging
            if (0) {
                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
            }

            const size_t bpe = ggml_type_size(ggml_type(ttype));

            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
                return false;
            }

            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

            total_size += ggml_nbytes(tensor);
            if (++n_tensors % 8 == 0) {
                printf(".");
                fflush(stdout);
            }
        }

        printf(" done\n");

        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
    }

    fin.close();

    return true;
}
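
// For reference, the reads in gpt_neox_model_load() above imply the following
// on-disk layout for the model file (reconstructed from this function only;
// field names are descriptive, not an official specification):
//
//   uint32_t magic                     must equal GGML_FILE_MAGIC
//   int32_t  n_vocab, n_ctx, n_embd, n_head, n_layer, n_rot, par_res, ftype
//   n_vocab times: { uint32_t len; char token[len]; }
//   until EOF, one record per tensor:
//     int32_t  n_dims, name_length, ttype
//     int32_t  ne[n_dims]              tensor dimensions
//     char     name[name_length]
//     raw tensor data, ggml_nbytes(tensor) bytes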


// feed-forward network
ggml_tensor * gpt_neox_ff(
        const gpt_neox_layer & layer,
        ggml_context * ctx0,
        ggml_tensor * inp,
        float eps) {
    ggml_tensor * cur = ggml_norm(ctx0, inp, eps);

    cur = ggml_add(ctx0,
            ggml_mul(ctx0,
                ggml_repeat(ctx0, layer.ln_2_g, cur),
                cur),
            ggml_repeat(ctx0, layer.ln_2_b, cur));

    cur = ggml_mul_mat(ctx0,
            layer.c_mlp_fc_w,
            cur);

    cur = ggml_add(ctx0,
            ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
            cur);

    // GELU activation
    cur = ggml_gelu(ctx0, cur);

    // projection
    // cur = proj_w*cur + proj_b
    cur = ggml_mul_mat(ctx0,
            layer.c_mlp_proj_w,
            cur);

    cur = ggml_add(ctx0,
            ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
            cur);
    return cur;
}
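
// Schematically, gpt_neox_eval() below combines self-attention and this
// feed-forward block in one of two residual arrangements, selected by
// hparams.par_res (the layer norms are folded into attn() and ff() here):
//
//   par_res == 0 (sequential): x' = x + attn(x);  out = x' + ff(x')
//   par_res != 0 (parallel):   out = x + attn(x) + ff(x)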

// evaluate the transformer
//
// - model: the model
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
bool gpt_neox_eval(
        const gpt_neox_model & model,
        const int n_threads,
        const int n_past,
        const std::vector<gpt_vocab::id> & embd_inp,
        std::vector<float> & embd_w,
        size_t & mem_per_token) {
    const int N = embd_inp.size();

    const auto & hparams = model.hparams;

    const int n_embd = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_ctx = hparams.n_ctx;
    const int n_head = hparams.n_head;
    const int n_vocab = hparams.n_vocab;
    const int n_rot = hparams.n_rot;

    static size_t buf_size = 256u*1024*1024;
    static void * buf = malloc(buf_size);

    // use 2 scratch buffers
    // TODO: very hacky solution - reimplement in a more elegant way
    static size_t scr0_size = 256u*1024*1024;
    static void * scr0 = malloc(scr0_size);

    static size_t scr1_size = 256u*1024*1024;
    static void * scr1 = malloc(scr1_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
        buf_size = buf_size_new;
        buf = realloc(buf, buf_size);
        if (buf == nullptr) {
            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
            return false;
        }
    }

    struct ggml_init_params params = {
        /*.mem_size =*/ buf_size,
        /*.mem_buffer =*/ buf,
        /*.no_alloc =*/ false,
    };

    struct ggml_context * ctx0 = ggml_init(params);
    struct ggml_cgraph gf = {};

    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));

    // wte
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);

    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * cur;

        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });

        // self-attention
        {
            {
                cur = ggml_norm(ctx0, inpL, hparams.eps);

                cur = ggml_add(ctx0,
                        ggml_mul(ctx0,
                            ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
                            cur),
                        ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
            }

            // compute QKV
            {
                cur = ggml_mul_mat(ctx0,
                        model.layers[il].c_attn_attn_w,
                        cur);

                cur = ggml_add(ctx0,
                        ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
                        cur);
            }

            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head));
            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
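            // the fused QKV result stores the heads interleaved: each row of `cur`
            // is [q_0 | k_0 | v_0 | q_1 | k_1 | v_1 | ...] with every slice
            // n_embd/n_head floats wide, so the three views above step through a
            // row with a stride of 3*(n_embd/n_head) floats, starting at offsets
            // of 0, 1 and 2 head-widths respectively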

            // using mode = 2 for GPT-NeoX mode
            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0);
            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0);

            // store key and value to memory
            {
                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N));

                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
                struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
                        ( n_ctx)*ggml_element_size(model.memory_v),
                        (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));

                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
            }

            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            struct ggml_tensor * KQ_scaled =
                ggml_scale_inplace(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
                        );

            // KQ_masked = mask_past(KQ_scaled)
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);

            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, model.memory_v,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(model.memory_v),
                        n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(model.memory_v)*n_embd);

            // KQV = transpose(V) * KQ_soft_max
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

            // cur = KQV_merged.contiguous().view(n_embd, N)
            cur = ggml_cpy(ctx0,
                    KQV_merged,
                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection
            {
                cur = ggml_mul_mat(ctx0,
                        model.layers[il].c_attn_proj_w,
                        cur);

                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur);
            }
        }

        ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });

        if (hparams.par_res == 0) {
            struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);

            cur = gpt_neox_ff(model.layers[il], ctx0, inpFF, hparams.eps);

            // input for next layer
            inpL = ggml_add(ctx0, cur, inpFF);
        } else {
            struct ggml_tensor * inpFF = cur;

            // this is independent of the self-attention result, so it could be done in parallel to the self-attention
            // note here we pass inpL instead of cur
            cur = gpt_neox_ff(model.layers[il], ctx0, inpL, hparams.eps);

            // layer input + FF
            cur = ggml_add(ctx0, cur, inpFF);

            // input for next layer
            inpL = ggml_add(ctx0, cur, inpL);
        }
    }

    ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });

    // norm
    {
        inpL = ggml_norm(ctx0, inpL, hparams.eps);

        // inpL = ln_f_g*inpL + ln_f_b
        inpL = ggml_add(ctx0,
                ggml_mul(ctx0,
                    ggml_repeat(ctx0, model.ln_f_g, inpL),
                    inpL),
                ggml_repeat(ctx0, model.ln_f_b, inpL));
    }

    ggml_set_scratch(ctx0, { 0, 0, nullptr, });

    // lm_head
    {
        inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);

        //inpL = ggml_add(ctx0,
        //        ggml_repeat(ctx0, model.lmh_b, inpL),
        //        inpL);
    }

    // logits -> probs
    //inpL = ggml_soft_max_inplace(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);

    //if (n_past%100 == 0) {
    //    ggml_graph_print (&gf);
    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
    //}

    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

    // return result for just the last token
    embd_w.resize(n_vocab);
    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);

    if (mem_per_token == 0) {
        mem_per_token = ggml_used_mem(ctx0)/N;
    }
    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));

    ggml_free(ctx0);

    return true;
}
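
// The key/value cache used by gpt_neox_eval() is stored as two flat buffers of
// n_layer*n_ctx*n_embd F16 values. Restating the view offsets above in element
// terms (for readability only):
//
//   keys:   position p of layer il starts at element (il*n_ctx + p)*n_embd
//   values: stored transposed; channel e of layer il starts at element
//           il*n_ctx*n_embd + e*n_ctx, and position p sits at that base + p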

int main(int argc, char ** argv) {
    ggml_time_init();

    const int64_t t_main_start_us = ggml_time_us();

    gpt_params params;
    params.model = "models/stablelm-base-alpha-3b/ggml-model-f16.bin";

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    if (params.seed < 0) {
        params.seed = time(NULL);
    }

    printf("%s: seed = %d\n", __func__, params.seed);

    std::mt19937 rng(params.seed);
    if (params.prompt.empty()) {
        params.prompt = gpt_random_prompt(rng);
    }

    int64_t t_load_us = 0;

    gpt_vocab vocab;
    gpt_neox_model model;

    // load the model
    {
        const int64_t t_start_us = ggml_time_us();

        if (!gpt_neox_model_load(params.model, model, vocab)) {
            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
            return 1;
        }

        t_load_us = ggml_time_us() - t_start_us;

        test_gpt_tokenizer(vocab, params.token_test);
    }

    int n_past = 0;

    int64_t t_sample_us = 0;
    int64_t t_predict_us = 0;

    std::vector<float> logits;

    // tokenize the prompt
    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);

    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());

    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
    for (size_t i = 0; i < embd_inp.size(); i++) {
        printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
    }
    printf("\n");

    std::vector<gpt_vocab::id> embd;

    // determine the required inference memory per token:
    size_t mem_per_token = 0;
    gpt_neox_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);

    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
        // predict
        if (embd.size() > 0) {
            const int64_t t_start_us = ggml_time_us();

            if (!gpt_neox_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
                printf("Failed to predict\n");
                return 1;
            }

            t_predict_us += ggml_time_us() - t_start_us;
        }

        n_past += embd.size();
        embd.clear();

        if (i >= embd_inp.size()) {
            // sample next token
            const int top_k = params.top_k;
            const float top_p = params.top_p;
            const float temp = params.temp;

            const int n_vocab = model.hparams.n_vocab;

            gpt_vocab::id id = 0;

            {
                const int64_t t_start_sample_us = ggml_time_us();

                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);

                t_sample_us += ggml_time_us() - t_start_sample_us;
            }

            // add it to the context
            embd.push_back(id);
        } else {
            // if here, it means we are still processing the input prompt
            for (size_t k = i; k < embd_inp.size(); k++) {
                embd.push_back(embd_inp[k]);
                if (int32_t(embd.size()) > params.n_batch) {
                    break;
                }
            }
            i += embd.size() - 1;
        }

        // display text
        for (auto id : embd) {
            printf("%s", vocab.id_to_token[id].c_str());
        }
        fflush(stdout);

        // end of text token
        if (embd.back() == 0) {
            break;
        }
    }

    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();

        printf("\n\n");
        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
        printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
        printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
        printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
    }

    ggml_free(model.ctx);

    return 0;
}