llama-rb 0.1.0
- checksums.yaml +7 -0
- data/Gemfile +10 -0
- data/Gemfile.lock +85 -0
- data/LICENSE +21 -0
- data/README.md +81 -0
- data/Rakefile +10 -0
- data/ext/llama/common.cpp +311 -0
- data/ext/llama/common.h +95 -0
- data/ext/llama/extconf.rb +12 -0
- data/ext/llama/ggml.c +10642 -0
- data/ext/llama/ggml.h +778 -0
- data/ext/llama/llama.cpp +1815 -0
- data/ext/llama/llama.h +152 -0
- data/ext/llama/model.cpp +192 -0
- data/lib/llama/model.rb +86 -0
- data/lib/llama/version.rb +3 -0
- data/lib/llama.rb +6 -0
- data/llama-rb.gemspec +50 -0
- data/models/.gitkeep +0 -0
- metadata +80 -0
data/ext/llama/llama.cpp
ADDED
@@ -0,0 +1,1815 @@
#include "llama.h"

#include "ggml.h"

#include <cinttypes>
#include <fstream>
#include <random>
#include <map>
#include <unordered_map>
#include <queue>
#include <regex>
#include <cassert>
#include <cstring>

#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#else
#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>
#endif

#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
#define Max(X, Y) ((Y) < (X) ? (X) : (Y))

#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16

#define LLAMA_ASSERT(x) \
    do { \
        if (!(x)) { \
            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
            abort(); \
        } \
    } while (0)


// determine number of model parts based on the dimension
static const std::unordered_map<int, int> LLAMA_N_PARTS = {
    { 4096, 1 },
    { 5120, 2 },
    { 6656, 4 },
    { 8192, 8 },
};

// available llama models
enum e_model {
    MODEL_UNKNOWN,
    MODEL_7B,
    MODEL_13B,
    MODEL_30B,
    MODEL_65B,
};

static const size_t MB = 1024*1024;

// computed for n_ctx == 2048
// TODO: dynamically determine these sizes
//       needs modifications in ggml

static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
    { MODEL_7B, 512ull*MB },
    { MODEL_13B, 512ull*MB },
    { MODEL_30B, 512ull*MB },
    { MODEL_65B, 512ull*MB },
};

static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
    { MODEL_7B, 512ull*MB },
    { MODEL_13B, 512ull*MB },
    { MODEL_30B, 512ull*MB },
    { MODEL_65B, 512ull*MB },
};

// 2*n_embd*n_ctx*n_layer*sizeof(float16)
static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
    { MODEL_7B, 1026ull*MB },
    { MODEL_13B, 1608ull*MB },
    { MODEL_30B, 3124ull*MB },
    { MODEL_65B, 5120ull*MB },
};

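// note: for the 7B model (n_embd = 4096, n_layer = 32) at n_ctx = 2048, the
// formula above gives 2*4096*2048*32*2 bytes = exactly 1024 MB; the table
// entries appear to add a small margin on top for the ggml context overhead.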
// this is mostly needed for temporary mul_mat buffers to dequantize the data
// not actually needed if BLAS is disabled
static const std::map<e_model, size_t> MEM_REQ_EVAL = {
    { MODEL_7B, 768ull*MB },
    { MODEL_13B, 1024ull*MB },
    { MODEL_30B, 1280ull*MB },
    { MODEL_65B, 1536ull*MB },
};

// default hparams (LLaMA 7B)
struct llama_hparams {
    int32_t n_vocab = 32000;
    int32_t n_ctx = 512; // this is provided as user input?
    int32_t n_embd = 4096;
    int32_t n_mult = 256;
    int32_t n_head = 32;
    int32_t n_layer = 32;
    int32_t n_rot = 64;
    int32_t f16 = 1;
};

struct llama_layer {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};

struct llama_kv_cache {
    struct ggml_tensor * k;
    struct ggml_tensor * v;

    struct ggml_context * ctx;

    std::vector<uint8_t> buf;

    int n; // number of tokens currently in the cache
};

struct llama_model {
    e_model type = MODEL_UNKNOWN;

    llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * output;

    std::vector<llama_layer> layers;

    // context
    struct ggml_context * ctx;

    // key + value cache for the self attention
    // TODO: move to llama_state
    struct llama_kv_cache kv_self;

    // the model memory buffer
    std::vector<uint8_t> buf;

    // model memory mapped file
    void * mm_addr = NULL;
    uint64_t mm_length = 0;

    // tensors
    int n_loaded;
    std::unordered_map<std::string, struct ggml_tensor *> tensors;
};

struct llama_vocab {
    using id = int32_t;
    using token = std::string;

    struct token_score {
        token tok;
        float score;
    };

    std::unordered_map<token, id> token_to_id;
    std::vector<token_score> id_to_token;
};

struct llama_context {
    std::mt19937 rng;

    int64_t t_load_us = 0;
    int64_t t_start_us = 0;
    bool has_evaluated_once = false;

    int64_t t_sample_us = 0;
    int64_t t_eval_us = 0;
    int64_t t_p_eval_us = 0;

    int32_t n_sample = 0; // number of tokens sampled
    int32_t n_eval = 0; // number of eval calls
    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

    llama_model model;
    llama_vocab vocab;

    size_t mem_per_token = 0;

    // decode output (2-dimensional array: [n_tokens][n_vocab])
    std::vector<float> logits;
    bool logits_all = false;

    // input embedding (1-dimensional array: [n_embd])
    std::vector<float> embedding;

    // memory buffers used to evaluate the model
    // TODO: move in llama_state
    std::vector<uint8_t> buf_compute;
    std::vector<uint8_t> buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

    int buf_last = 0;
    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

    void use_buf(struct ggml_context * ctx, int i) {
#if defined(LLAMA_USE_SCRATCH)
        size_t last_size = 0;

        if (i == -1) {
            last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
        } else {
            auto & buf = buf_scratch[i];
            last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
        }

        if (buf_last >= 0) {
            buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
        }

        buf_last = i;
#else
        (void) i;
        (void) ctx;
#endif
    }

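    // note: use_buf(ctx, 0/1) points ggml's allocator at one of two reusable
    // scratch buffers so per-layer intermediates can share memory, while
    // use_buf(ctx, -1) restores normal allocation; the value returned by
    // ggml_set_scratch() (bytes used while the previous scratch was active)
    // is folded into buf_max_size[] for the report in get_buf_max_mem().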
    size_t get_buf_max_mem(int i) const {
#if defined(LLAMA_USE_SCRATCH)
        return buf_max_size[i];
#else
        (void) i;
        return 0;
#endif
    }
};

//
// kv cache
//

static bool kv_cache_init(
        const struct llama_hparams & hparams,
        struct llama_kv_cache & cache,
        ggml_type wtype,
        int n_ctx) {
    const int n_embd = hparams.n_embd;
    const int n_layer = hparams.n_layer;

    const int n_mem = n_layer*n_ctx;
    const int n_elements = n_embd*n_mem;

    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

    struct ggml_init_params params;
    params.mem_size = cache.buf.size();
    params.mem_buffer = cache.buf.data();
    params.no_alloc = false;

    cache.ctx = ggml_init(params);

    if (!cache.ctx) {
        fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
        return false;
    }

    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);

    return true;
}

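// note: the buffer above holds two tensors of n_embd * n_layer * n_ctx
// elements each (all layers keep their keys and values in one flat
// allocation), plus 2 MB of slack for ggml's per-tensor bookkeeping.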
static void kv_cache_free(struct llama_kv_cache & cache) {
    if (cache.ctx) {
        ggml_free(cache.ctx);
        cache.ctx = nullptr;
    }
}

struct llama_context_params llama_context_default_params() {
    struct llama_context_params result = {
        /*.n_ctx                       =*/ 512,
        /*.n_parts                     =*/ -1,
        /*.seed                        =*/ 0,
        /*.f16_kv                      =*/ false,
        /*.logits_all                  =*/ false,
        /*.vocab_only                  =*/ false,
        /*.use_mlock                   =*/ false,
        /*.embedding                   =*/ false,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
    };

    return result;
}

//
// model loading
//

static void *mmap_file(const char *fname, uint64_t *mm_length) {
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
    HANDLE hFile = CreateFileA(fname,
            GENERIC_READ,
            FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
            NULL,
            OPEN_EXISTING,
            FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
            NULL);
    if (hFile == INVALID_HANDLE_VALUE) return 0;
    LARGE_INTEGER fileSize;
    fileSize.QuadPart = -1;
    GetFileSizeEx(hFile, &fileSize);
    int64_t length = fileSize.QuadPart;
    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
    CloseHandle(hFile);
    if (!hMapping) return 0;
    void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
    CloseHandle(hMapping);
    if (!addr) return 0;
#else
    int fd = open(fname, O_RDONLY);
    if (fd == -1) return 0;
    int64_t length = lseek(fd, 0, SEEK_END);
    void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
    close(fd);
    if (addr == MAP_FAILED) return 0;
#endif
    *mm_length = length;
    return addr;
}

static void munmap_file(void * addr, size_t length) {
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
    UnmapViewOfFile(addr);
#else
    munmap(addr, length);
#endif
}

static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
    fprintf(stderr,
            "%s: invalid model file (bad magic [got %#x want %#x])\n"
            "\tyou most likely need to regenerate your ggml files\n"
            "\tthe benefit is you'll get 10-100x faster load times\n"
            "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
            "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
            "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
            path, got, want);
    return false;
}

static bool llama_model_load(
        const std::string & fname,
        llama_context & lctx,
        int n_ctx,
        int n_parts,
        ggml_type memory_type,
        bool vocab_only,
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

    lctx.t_start_us = ggml_time_us();

    auto & model = lctx.model;
    auto & vocab = lctx.vocab;

    auto fin = std::ifstream(fname, std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
        return false;
    }

    std::vector<char> f_buf(1024*1024);
    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());

    fin.seekg(0, fin.end);
    const size_t file_size = fin.tellg();
    fin.seekg(0);

    // verify magic
    {
        uint32_t magic;
        fin.read((char *) &magic, sizeof(magic));
        if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
                    __func__, fname.c_str());
            return false;
        }
        if (magic != LLAMA_FILE_MAGIC) {
            return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
        }

        uint32_t format_version;
        fin.read((char *) &format_version, sizeof(format_version));

        if (format_version != LLAMA_FILE_VERSION) {
            fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
                    __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
            return false;
        }
    }

    int n_ff = 0;

    // load hparams
    {
        auto & hparams = model.hparams;

        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
        //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
        fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
        fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
        fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
        fin.read((char *) &hparams.f16, sizeof(hparams.f16));

        hparams.n_ctx = n_ctx;

        n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

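        // note: this rounds 2/3 of the conventional 4*n_embd FFN width up to a
        // multiple of n_mult; e.g. for 7B (n_embd = 4096, n_mult = 256):
        // 2*16384/3 = 10922, rounded up -> n_ff = 11008.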
        if (n_parts < 1) {
            n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
        }

        // temp warning to tell the user to use "--n_parts"
        if (hparams.f16 == 4 && n_parts != 1) {
            fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
            fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
        }

        if (hparams.n_layer == 32) {
            model.type = e_model::MODEL_7B;
        }

        if (hparams.n_layer == 40) {
            model.type = e_model::MODEL_13B;
        }

        if (hparams.n_layer == 60) {
            model.type = e_model::MODEL_30B;
        }

        if (hparams.n_layer == 80) {
            model.type = e_model::MODEL_65B;
        }

        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
        fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
        fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
        fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
        fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
        fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
        fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
        fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
        fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
        fprintf(stderr, "%s: type = %d\n", __func__, model.type);
    }

    // load vocab
    {
        std::string word;
        vocab.id_to_token.resize(model.hparams.n_vocab);
        std::vector<char> tmp(64);

        for (int i = 0; i < model.hparams.n_vocab; i++) {
            uint32_t len;
            fin.read((char *) &len, sizeof(len));

            word.resize(len);
            if (len > 0) {
                tmp.resize(len);
                fin.read(tmp.data(), len);
                word.assign(tmp.data(), len);
            } else {
                word.clear();
            }

            float score;
            fin.read((char *) &score, sizeof(score));

            vocab.token_to_id[word] = i;

            auto &tok_score = vocab.id_to_token[i];
            tok_score.tok = word;
            tok_score.score = score;
        }
    }

    if (vocab_only) {
        return true;
    }

    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
    // in order to save memory and also to speed up the computation
    // wtype is for per-layer weights, while vtype is for other weights
    ggml_type wtype, vtype;
    switch (model.hparams.f16) {
        case 0: wtype = vtype = GGML_TYPE_F32;  break;
        case 1: wtype = vtype = GGML_TYPE_F16;  break;
        case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
        case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
        case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
        default:
            {
                fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
                        __func__, fname.c_str(), model.hparams.f16);
                return false;
            }
    }

    // map model into memory
    char *mm_addr = NULL;
    model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
    if (model.mm_addr == NULL) {
        fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
        return false;
    }
    mm_addr = (char *)model.mm_addr;
    fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));

    auto & ctx = model.ctx;

    size_t ctx_size = 0;
    {
        const auto &hparams = model.hparams;
        const int n_layer = hparams.n_layer;
        ctx_size += (5 + 10*n_layer)*256; // object overhead
        fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
    }

    // print memory requirements
    {
        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;

        // this is the total memory required to run the inference
        const size_t mem_required =
            ctx_size +
            model.mm_length +
            MEM_REQ_SCRATCH0.at(model.type) +
            MEM_REQ_SCRATCH1.at(model.type) +
            MEM_REQ_EVAL.at (model.type);

        // this is the memory required by one llama_state
        const size_t mem_required_state =
            scale*MEM_REQ_KV_SELF.at(model.type);

        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
    }

    // create the ggml context
    {
        lctx.model.buf.resize(ctx_size);

        struct ggml_init_params params = {
            /*.mem_size   =*/ lctx.model.buf.size(),
            /*.mem_buffer =*/ lctx.model.buf.data(),
            /*.no_alloc   =*/ true,
        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
            return false;
        }
    }

    // prepare memory for the weights
    {
        const auto & hparams = model.hparams;

        const int n_embd = hparams.n_embd;
        const int n_layer = hparams.n_layer;
        const int n_vocab = hparams.n_vocab;

        model.layers.resize(n_layer);

        model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);

        model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
        model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);

        // map by name
        model.tensors["tok_embeddings.weight"] = model.tok_embeddings;

        model.tensors["norm.weight"] = model.norm;
        model.tensors["output.weight"] = model.output;

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = model.layers[i];

            layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

            layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);

            layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

            layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
            layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
            layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);

            // map by name
            model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;

            model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
            model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
            model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
            model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;

            model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;

            model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
            model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
            model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
        }
    }

    std::vector<uint8_t> tmp;

    if (progress_callback) {
        progress_callback(0.0, progress_callback_user_data);
    }

    fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());

    // load weights
    {
        size_t total_size = 0;
        model.n_loaded = 0;

        while (true) {
            int32_t n_dims;
            int32_t length;
            int32_t ftype;

            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
            fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));

            if (fin.eof()) {
                break;
            }

            int32_t nelements = 1;
            int32_t ne[2] = { 1, 1 };
            for (int i = 0; i < n_dims; ++i) {
                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
                nelements *= ne[i];
            }

            std::string name(length, 0);
            fin.read(&name[0], length);

            if (model.tensors.find(name.data()) == model.tensors.end()) {
                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
                return false;
            }

            auto tensor = model.tensors[name.data()];

            if (ggml_nelements(tensor) != nelements) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
                return false;
            }
            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
                return false;
            }
            if (0) {
                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
                fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
            }

            switch (ftype) {
                case 0: // f32
                case 1: // f16
                    break;
                case 2: // q4_0
                case 3: // q4_1
                    assert(ne[0] % 64 == 0);
                    break;
                default:
                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
                    return false;
            };

            // load the tensor data into memory without copying or reading it
            size_t offset = fin.tellg();
            size_t tensor_data_size = ggml_nbytes(tensor);
            offset = (offset + 31) & -32;
            tensor->data = mm_addr + offset;
            fin.seekg(offset + tensor_data_size);
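            // note: (offset + 31) & -32 rounds the file offset up to the next
            // 32-byte boundary (-32 sign-extends to ...1110'0000), keeping the
            // mmap'd tensor data aligned for SIMD access.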
            total_size += tensor_data_size;
            model.n_loaded++;

            // progress
            if (progress_callback) {
                double current_progress = size_t(fin.tellg()) / double(file_size);
                progress_callback(current_progress, progress_callback_user_data);
            }
        }

        fin.close();

        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
        if (model.n_loaded == 0) {
            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
        } else if (model.n_loaded != (int) model.tensors.size()) {
            fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
            return false;
        }
    }

    // loading time will be recalculated after the first eval, so
    // we take page faults deferred by mmap() into consideration
    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;

    if (progress_callback) {
        progress_callback(1.0, progress_callback_user_data);
    }

    return true;
}

// evaluate the transformer
//
//   - lctx:      llama context
//   - tokens:    new batch of tokens to process
//   - n_past:    the context size so far
//   - n_threads: number of threads to use
//
static bool llama_eval_internal(
        llama_context & lctx,
        const llama_token * tokens,
        const int n_tokens,
        const int n_past,
        const int n_threads) {
    const int64_t t_start_us = ggml_time_us();

    const int N = n_tokens;

    const auto & model = lctx.model;
    const auto & hparams = model.hparams;

    auto & kv_self = model.kv_self;

    LLAMA_ASSERT(!!kv_self.ctx);

    const int n_embd = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_ctx = hparams.n_ctx;
    const int n_head = hparams.n_head;
    const int n_vocab = hparams.n_vocab;
    const int n_rot = hparams.n_embd/hparams.n_head;

    auto & mem_per_token = lctx.mem_per_token;
    auto & buf_compute = lctx.buf_compute;

    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_compute.size(),
        /*.mem_buffer =*/ buf_compute.data(),
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx0 = ggml_init(params);

    // for big prompts, if BLAS is enabled, it is better to use only one thread
    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
    ggml_cgraph gf = {};
    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;

    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, tokens, N*ggml_element_size(embd));

    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        lctx.use_buf(ctx0, 0);

        // norm
        {
            cur = ggml_rms_norm(ctx0, inpL);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
                    cur);
        }

        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);

            // store key and value to memory
            if (N >= 1) {
                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));

                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
            }

            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        ggml_rope(ctx0,
                            ggml_cpy(ctx0,
                                Qcur,
                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
                            n_past, n_rot, 0),
                        0, 2, 1, 3);

            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_rope(ctx0,
                            ggml_reshape_3d(ctx0,
                                ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                                n_embd/n_head, n_head, n_past + N),
                            n_past, n_rot, 1),
                        0, 2, 1, 3);

            // K * Q
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            struct ggml_tensor * V_trans =
                ggml_cpy(ctx0,
                        ggml_permute(ctx0,
                            ggml_reshape_3d(ctx0,
                                ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
                                n_embd/n_head, n_head, n_past + N),
                            1, 2, 0, 3),
                        ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));

            // KQV = transpose(V) * KQ_soft_max
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

            // cur = KQV_merged.contiguous().view(n_embd, N)
            cur = ggml_cpy(ctx0,
                    KQV_merged,
                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            cur = ggml_mul_mat(ctx0,
                    model.layers[il].wo,
                    cur);
        }
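        // note: each layer owns a contiguous slice of the kv cache starting at
        // element offset il*n_ctx*n_embd, so the views above cover exactly the
        // first n_past + N positions of this layer; 1/sqrt(n_embd/n_head) is
        // the standard scaled-dot-product attention factor.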

        lctx.use_buf(ctx0, 1);

        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                cur = ggml_rms_norm(ctx0, inpFF);

                // cur = ffn_norm*cur
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
                        cur);
            }

            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model.layers[il].w3,
                    cur);

            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w1,
                    cur);

            // SILU activation
            cur = ggml_silu(ctx0, cur);

            cur = ggml_mul(ctx0, cur, tmp);

            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w2,
                    cur);
        }
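        // note: w1/w3/w2 implement LLaMA's SwiGLU feed-forward,
        // w2 @ (silu(w1 @ x) * (w3 @ x)), with n_ff hidden units.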

        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        inpL = cur;
    }

    lctx.use_buf(ctx0, 0);

    // used at the end to optionally extract the embeddings
    struct ggml_tensor * embeddings = NULL;

    // norm
    {

        inpL = ggml_rms_norm(ctx0, inpL);

        // inpL = norm*inpL
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model.norm, inpL),
                    inpL);

        embeddings = inpL;
    }

    // lm_head
    inpL = ggml_mul_mat(ctx0, model.output, inpL);

    lctx.use_buf(ctx0, -1);

    // logits -> probs
    //inpL = ggml_soft_max(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
    ggml_graph_compute (ctx0, &gf);

    //if (n_past%100 == 0) {
    //    ggml_graph_print (&gf);
    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
    //}

    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

    // extract logits
    {
        auto & logits_out = lctx.logits;

        if (lctx.logits_all) {
            logits_out.resize(n_vocab * N);
            memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
        } else {
            // return result for just the last token
            logits_out.resize(n_vocab);
            memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
        }
    }

    // extract embeddings
    if (lctx.embedding.size()) {
        auto & embedding_out = lctx.embedding;

        embedding_out.resize(n_embd);
        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
    }

    if (mem_per_token == 0) {
        mem_per_token = ggml_used_mem(ctx0)/N;
    }

#if 0
    printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
            ggml_used_mem(ctx0)/1024.0/1024.0,
            lctx.get_buf_max_mem(0)/1024.0/1024.0,
            lctx.get_buf_max_mem(1)/1024.0/1024.0);
#endif

    ggml_free(ctx0);

    // measure the performance only for the single-token evals
    if (N == 1) {
        lctx.t_eval_us += ggml_time_us() - t_start_us;
        lctx.n_eval++;
    }
    else if (N > 1) {
        lctx.t_p_eval_us += ggml_time_us() - t_start_us;
        lctx.n_p_eval += N;
    }

    return true;
}

//
// tokenizer
//

static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}
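// note: the table maps the high nibble of the lead byte to the UTF-8 sequence
// length (0xxx -> 1, 110x -> 2, 1110 -> 3, 1111 -> 4); continuation bytes
// (10xx) also yield 1 so that malformed input still makes progress.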

struct llama_sp_symbol {
    using index = int;
    index prev;
    index next;
    const char * text;
    size_t n;
};

struct llama_sp_bigram {
    struct comparator {
        bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
        }
    };
    using queue_storage = std::vector<llama_sp_bigram>;
    using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
    llama_sp_symbol::index left;
    llama_sp_symbol::index right;
    float score;
    size_t size;
};

// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
struct llama_tokenizer {
    llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}

    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        // split string into utf8 chars
        int index = 0;
        size_t offs = 0;
        while (offs < text.size()) {
            llama_sp_symbol sym;
            size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
            sym.text = text.c_str() + offs;
            sym.n = char_len;
            offs += char_len;
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
            symbols_.emplace_back(std::move(sym));
        }

        // seed the work queue with all possible 2-character tokens.
        for (size_t i = 1; i < symbols_.size(); ++i) {
            try_add_bigram(i - 1, i);
        }

        // keep substituting the highest frequency pairs for as long as we can.
        while (!work_queue_.empty()) {
            auto bigram = work_queue_.top();
            work_queue_.pop();

            auto & left_sym = symbols_[bigram.left];
            auto & right_sym = symbols_[bigram.right];

            // if one of the symbols already got merged, skip it.
            if (left_sym.n == 0 || right_sym.n == 0 ||
                left_sym.n + right_sym.n != bigram.size) {
                continue;
            }

            // merge the right sym into the left one
            left_sym.n += right_sym.n;
            right_sym.n = 0;

            //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);

            // remove the right sym from the chain
            left_sym.next = right_sym.next;
            if (right_sym.next >= 0) {
                symbols_[right_sym.next].prev = bigram.left;
            }

            // find more substitutions
            try_add_bigram(left_sym.prev, bigram.left);
            try_add_bigram(bigram.left, left_sym.next);
        }

        for (int i = 0; i != -1; i = symbols_[i].next) {
            auto & symbol = symbols_[i];
            auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));

            if (token == vocab_.token_to_id.end()) {
                // output any symbols that did not form tokens as bytes.
                for (int j = 0; j < (int) symbol.n; ++j) {
                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
                    output.push_back(token_id);
                }
            } else {
                output.push_back((*token).second);
            }
        }
    }
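    // note: this is SentencePiece-style greedy BPE - repeatedly merge the
    // adjacent pair with the best vocab score. In the byte fallback, "+ 3"
    // skips what appear to be the three reserved ids at the start of the
    // llama vocab (unk/bos/eos), mapping raw byte b to token id b + 3.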

private:
    void try_add_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }

        const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
        auto token = vocab_.token_to_id.find(text);

        if (token == vocab_.token_to_id.end()) {
            return;
        }

        if (static_cast<size_t>((*token).second) >= vocab_.id_to_token.size()) {
            return;
        }

        const auto &tok_score = vocab_.id_to_token[(*token).second];

        llama_sp_bigram bigram;
        bigram.left = left;
        bigram.right = right;
        bigram.score = tok_score.score;
        bigram.size = text.size();
        work_queue_.push(bigram);
    }

    const llama_vocab & vocab_;
    std::vector<llama_sp_symbol> symbols_;
    llama_sp_bigram::queue work_queue_;
};

static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
    llama_tokenizer tokenizer(vocab);
    std::vector<llama_vocab::id> output;

    if (text.size() == 0) {
        return output;
    }

    if (bos) {
        output.push_back(1);
    }

    tokenizer.tokenize(text, output);
    return output;
}

//
// sampling
//

static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
    // find the top k tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);
}

static llama_vocab::id llama_sample_top_p_top_k(
        llama_context & lctx,
        const std::vector<llama_vocab::id> & last_n_tokens,
        int top_k,
        float top_p,
        float temp,
        float repeat_penalty) {
    auto & rng = lctx.rng;

    const int n_logits = lctx.model.hparams.n_vocab;

    const auto & logits = lctx.logits;
    const auto * plogits = logits.data() + logits.size() - n_logits;

    std::vector<std::pair<float, llama_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const float scale = 1.0f/temp;
        for (int i = 0; i < n_logits; ++i) {
            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
            if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
                // if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
                if (plogits[i] < 0.0f) {
                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
                } else {
                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                }
            } else {
                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
            }
        }
    }
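    // note: with repeat_penalty > 1 both branches push recently seen tokens
    // toward -inf: positive logits shrink when divided, negative logits grow
    // in magnitude when multiplied - hence the sign check above.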

    sample_top_k(logits_id, top_k);

    float maxl = -std::numeric_limits<float>::infinity();
    for (const auto & kv : logits_id) {
        maxl = Max(maxl, kv.first);
    }

    // compute probs for the top k tokens
    std::vector<float> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        const float p = expf(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    if (top_p < 1.0) {
        double cumsum = 0.0;
        for (int i = 0; i < (int) probs.size(); i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                probs.resize(i + 1);
                logits_id.resize(i + 1);
                break;
            }
        }

        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }
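    // note: nucleus (top-p) sampling - keep the smallest prefix of the
    // sorted candidates whose cumulative probability reaches top_p, then
    // renormalize; subtracting maxl before expf() above is the usual
    // softmax overflow guard.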

    //printf("\n");
    //for (int i = 0; i < (int) 10; i++) {
    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
    //}
    //printf("\n\n");
    //exit(0);

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;
}

//
// quantization
//

// TODO: reuse code from the llama_model_load() somehow
static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
    ggml_type type = GGML_TYPE_Q4_1;

    switch (itype) {
        case 2: type = GGML_TYPE_Q4_0; break;
        case 3: type = GGML_TYPE_Q4_1; break;
        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return false;
    };

    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
        fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
        return false;
    }

    llama_vocab vocab;

    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

    auto finp = std::ifstream(fname_inp, std::ios::binary);
    if (!finp) {
        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
        return false;
    }

    auto fout = std::ofstream(fname_out, std::ios::binary);
    if (!fout) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
        return false;
    }

    // verify magic
    {
        uint32_t magic;
        finp.read((char *) &magic, sizeof(magic));
        if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
                    __func__, fname_inp.c_str());
            return false;
        }
        if (magic != LLAMA_FILE_MAGIC) {
            return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
        }

        fout.write((char *) &magic, sizeof(magic));

        uint32_t format_version;
        finp.read((char *) &format_version, sizeof(format_version));

        if (format_version != LLAMA_FILE_VERSION) {
            fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
                    __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION);
            return false;
        }

        fout.write((char *) &format_version, sizeof(format_version));
    }

    llama_hparams hparams;

    // load hparams
    {
        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
        //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
        finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
        finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
        finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
        finp.read((char *) &hparams.f16, sizeof(hparams.f16));

        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
        printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
        printf("%s: n_head = %d\n", __func__, hparams.n_head);
        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
        printf("%s: f16 = %d\n", __func__, hparams.f16);

        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
        //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
        fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
        fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
        fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
        fout.write((char *) &itype, sizeof(hparams.f16));
    }

    // load vocab
    {
        const int32_t n_vocab = hparams.n_vocab;

        if (n_vocab != hparams.n_vocab) {
            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
                    __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
            return false;
        }

        std::vector<char> word(32);
        vocab.id_to_token.resize(n_vocab);
        for (int i = 0; i < n_vocab; i++) {
            uint32_t len;
            finp.read ((char *) &len, sizeof(len));
            fout.write((char *) &len, sizeof(len));

            word.resize(len);
            finp.read ((char *) &word[0], len);
            fout.write((char *) &word[0], len);

            float score;
            finp.read ((char *) &score, sizeof(score));
            fout.write((char *) &score, sizeof(score));

            vocab.token_to_id[word.data()] = i;

            auto &tok_score = vocab.id_to_token[i];
            tok_score.tok = word.data();
            tok_score.score = score;
        }
    }

    // load weights
    {
        size_t total_size_org = 0;
        size_t total_size_new = 0;

        std::vector<float> work;

        std::vector<uint8_t> data_u8;
        std::vector<ggml_fp16_t> data_f16;
        std::vector<float> data_f32;

        std::vector<int64_t> hist_all(1 << 4, 0);
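        // note: 1 << 4 = 16 buckets, one per possible 4-bit quantized value;
        // ggml_quantize_q4_* fills them so the prints below show how weights
        // distribute across the 16 quantization levels.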
1409
|
+
|
1410
|
+
while (true) {
|
1411
|
+
int32_t n_dims;
|
1412
|
+
int32_t length;
|
1413
|
+
int32_t ftype;
|
1414
|
+
|
1415
|
+
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
1416
|
+
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
|
1417
|
+
finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
1418
|
+
|
1419
|
+
if (finp.eof()) {
|
1420
|
+
break;
|
1421
|
+
}
|
1422
|
+
|
1423
|
+
int32_t nelements = 1;
|
1424
|
+
int32_t ne[2] = { 1, 1 };
|
1425
|
+
for (int i = 0; i < n_dims; ++i) {
|
1426
|
+
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
1427
|
+
nelements *= ne[i];
|
1428
|
+
}
|
1429
|
+
|
1430
|
+
std::string name(length, 0);
|
1431
|
+
finp.read (&name[0], length);
|
1432
|
+
|
1433
|
+
{
|
1434
|
+
// ensure tensor data is aligned
|
1435
|
+
uint64_t offset = finp.tellg();
|
1436
|
+
offset = (offset + 31) & -32;
|
1437
|
+
finp.seekg(offset);
|
1438
|
+
}
|
1439
|
+
|
1440
|
+
{
|
1441
|
+
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
1442
|
+
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
|
1443
|
+
}
|
1444
|
+
|
1445
|
+
// regexes of tensor names to be quantized
|
1446
|
+
const std::vector<std::string> k_names = {
|
1447
|
+
".*weight",
|
1448
|
+
};
|
1449
|
+
|
1450
|
+
bool quantize = false;
|
1451
|
+
for (const auto & s : k_names) {
|
1452
|
+
if (std::regex_match(name, std::regex(s))) {
|
1453
|
+
quantize = true;
|
1454
|
+
break;
|
1455
|
+
}
|
1456
|
+
}
|
1457
|
+
|
1458
|
+
// quantize only 2D tensors
|
1459
|
+
quantize &= (n_dims == 2);
|
1460
|
+
|
1461
|
+
if (quantize) {
|
1462
|
+
if (ftype != 0 && ftype != 1) {
|
1463
|
+
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
|
1464
|
+
return false;
|
1465
|
+
}
|
1466
|
+
|
1467
|
+
if (ftype == 1) {
|
1468
|
+
data_f16.resize(nelements);
|
1469
|
+
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
|
1470
|
+
data_f32.resize(nelements);
|
1471
|
+
for (int i = 0; i < nelements; ++i) {
|
1472
|
+
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
|
1473
|
+
}
|
1474
|
+
} else {
|
1475
|
+
data_f32.resize(nelements);
|
1476
|
+
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
|
1477
|
+
}
|
1478
|
+
|
1479
|
+
ftype = itype;
|
1480
|
+
} else {
|
1481
|
+
const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
|
1482
|
+
|
1483
|
+
data_u8.resize(nelements*bpe);
|
1484
|
+
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
|
1485
|
+
}
|
1486
|
+
|
1487
|
+
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
1488
|
+
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
|
1489
|
+
fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
1490
|
+
for (int i = 0; i < n_dims; ++i) {
|
1491
|
+
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
1492
|
+
}
|
1493
|
+
fout.write(&name[0], length);
|
1494
|
+
|
1495
|
+
{
|
1496
|
+
// ensure tensor data is aligned
|
1497
|
+
uint64_t offset = fout.tellp();
|
1498
|
+
offset = (offset + 31) & -32;
|
1499
|
+
fout.seekp(offset);
|
1500
|
+
}
        if (quantize) {
            printf("quantizing .. ");
            work.resize(nelements); // for quantization

            size_t cur_size = 0;
            std::vector<int64_t> hist_cur(1 << 4, 0);

            switch (type) {
                case GGML_TYPE_Q4_0:
                    {
                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_Q4_1:
                    {
                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                default:
                    {
                        fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
                        return false;
                    }
            }

            fout.write(reinterpret_cast<char *>(work.data()), cur_size);
            total_size_new += cur_size;

            printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
            for (int i = 0; i < (int) hist_cur.size(); ++i) {
                hist_all[i] += hist_cur[i];
            }

            for (int i = 0; i < (int) hist_cur.size(); ++i) {
                printf("%5.3f ", hist_cur[i] / float(nelements));
            }
            printf("\n");
        } else {
            printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
            fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
            total_size_new += data_u8.size();
        }

        total_size_org += nelements * sizeof(float);
    }

    printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
    printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);

    {
        int64_t sum_all = 0;
        for (int i = 0; i < (int) hist_all.size(); ++i) {
            sum_all += hist_all[i];
        }

        printf("%s: hist: ", __func__);
        for (int i = 0; i < (int) hist_all.size(); ++i) {
            printf("%5.3f ", hist_all[i] / float(sum_all));
        }
        printf("\n");
    }

    finp.close();
    fout.close();

    return true;
}

//
// interface implementation
//

struct llama_context * llama_init_from_file(
        const char * path_model,
        struct llama_context_params params) {
    ggml_time_init();

    llama_context * ctx = new llama_context;

    if (params.seed <= 0) {
        params.seed = time(NULL);
    }

    ctx->rng = std::mt19937(params.seed);
    ctx->logits_all = params.logits_all;

    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
                          params.vocab_only, params.progress_callback,
                          params.progress_callback_user_data)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
        llama_free(ctx);
        return nullptr;
    }

    if (params.use_mlock) {
        char *err;
        if (!ggml_mlock(ctx->model.ctx,
                        ctx->model.mm_addr,
                        ctx->model.mm_length,
                        &err)) {
            fprintf(stderr, "%s\n", err);
            free(err);
            llama_free(ctx);
            return nullptr;
        }
    }

    // reserve memory for context buffers
    {
        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
        }

        {
            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
            fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
        }

        const auto & hparams = ctx->model.hparams;

        // resized during inference
        if (params.logits_all) {
            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
        } else {
            ctx->logits.reserve(hparams.n_ctx);
        }

        if (params.embedding){
            ctx->embedding.resize(hparams.n_embd);
        }

        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));

        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
    }

    return ctx;
}
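
// Example (editorial sketch, not part of the upstream file): minimal
// initialization flow for the function above. llama_context_default_params()
// is assumed to be the defaults helper declared in llama.h; the model path is
// a placeholder.
static struct llama_context * example_init(void) {
    struct llama_context_params params = llama_context_default_params();
    params.n_ctx = 512; // requested context size
    params.seed  = 42;  // <= 0 falls back to time(NULL), as above

    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "example_init: failed to load model\n");
    }
    return ctx; // release with llama_free() when done
}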

void llama_free(struct llama_context * ctx) {
    kv_cache_free(ctx->model.kv_self);

    if (ctx->model.ctx) {
        ggml_free(ctx->model.ctx);
    }

    if (ctx->model.mm_addr) {
        munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
    }

    delete ctx;
}

int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
        int itype) {
    if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
        fprintf(stderr, "%s: failed to quantize\n", __func__);
        return 1;
    }

    return 0;
}
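
// Example (sketch): quantizing an f16 model file with the wrapper above. The
// itype mapping is an assumption here (2 -> GGML_TYPE_Q4_0, 3 -> GGML_TYPE_Q4_1,
// following the switch in llama_model_quantize_internal earlier in this file);
// both paths are placeholders.
static bool example_quantize(void) {
    const int itype = 2; // Q4_0 (assumed mapping, see note above)
    return llama_model_quantize("models/7B/ggml-model-f16.bin",
                                "models/7B/ggml-model-q4_0.bin",
                                itype) == 0; // 0 means success
}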

int llama_eval(
        struct llama_context * ctx,
        const llama_token * tokens,
        int n_tokens,
        int n_past,
        int n_threads) {
    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
        fprintf(stderr, "%s: failed to eval\n", __func__);
        return 1;
    }
    // get a more accurate load time, upon first eval
    if (!ctx->has_evaluated_once) {
        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
        ctx->has_evaluated_once = true;
    }
    return 0;
}
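
// Example (sketch): n_past bookkeeping for llama_eval. The first call evaluates
// the prompt with n_past = 0; each later call passes the number of tokens
// already in the KV cache so attention sees the whole history. Thread count
// and the continuation token are placeholders.
static bool example_eval(struct llama_context * ctx, const llama_token * prompt, int n_prompt) {
    const int n_threads = 4; // placeholder

    if (llama_eval(ctx, prompt, n_prompt, 0, n_threads) != 0) {
        return false;
    }

    // feed one more token, continuing right after the prompt
    llama_token next = llama_token_bos(); // placeholder token
    return llama_eval(ctx, &next, 1, n_prompt, n_threads) == 0;
}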

int llama_tokenize(
        struct llama_context * ctx,
        const char * text,
        llama_token * tokens,
        int n_max_tokens,
        bool add_bos) {
    auto res = llama_tokenize(ctx->vocab, text, add_bos);

    if (n_max_tokens < (int) res.size()) {
        fprintf(stderr, "%s: too many tokens\n", __func__);
        return -((int) res.size());
    }

    for (size_t i = 0; i < res.size(); i++) {
        tokens[i] = res[i];
    }

    return res.size();
}
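
// Example (sketch): the negative return value above encodes the required
// buffer size, so a caller can grow the buffer and retry.
static int example_tokenize(struct llama_context * ctx, const char * text, std::vector<llama_token> & out) {
    out.resize(llama_n_ctx(ctx)); // initial guess: one token per context slot
    int n = llama_tokenize(ctx, text, out.data(), (int) out.size(), /*add_bos=*/true);
    if (n < 0) {
        out.resize(-n); // -n is the number of tokens the text actually needs
        n = llama_tokenize(ctx, text, out.data(), (int) out.size(), true);
    }
    if (n >= 0) {
        out.resize(n); // trim to the tokens actually written
    }
    return n;
}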

int llama_n_vocab(struct llama_context * ctx) {
    return ctx->vocab.id_to_token.size();
}

int llama_n_ctx(struct llama_context * ctx) {
    return ctx->model.hparams.n_ctx;
}

int llama_n_embd(struct llama_context * ctx) {
    return ctx->model.hparams.n_embd;
}

float * llama_get_logits(struct llama_context * ctx) {
    return ctx->logits.data();
}

float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
}

const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
    if (token >= llama_n_vocab(ctx)) {
        return nullptr;
    }

    return ctx->vocab.id_to_token[token].tok.c_str();
}
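
// Example (sketch): greedy decoding straight from the logits, using the
// accessors above. Assumes logits_all = false, in which case the buffer holds
// n_vocab values for the last evaluated token; call only after llama_eval().
static llama_token example_greedy_pick(struct llama_context * ctx) {
    const float * logits  = llama_get_logits(ctx);
    const int     n_vocab = llama_n_vocab(ctx);

    llama_token best = 0;
    for (llama_token id = 1; id < n_vocab; ++id) {
        if (logits[id] > logits[best]) {
            best = id;
        }
    }

    printf("greedy pick: %s\n", llama_token_to_str(ctx, best));
    return best;
}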

llama_token llama_token_bos() {
    return 1;
}

llama_token llama_token_eos() {
    return 2;
}

llama_token llama_sample_top_p_top_k(
        llama_context * ctx,
        const llama_token * last_n_tokens_data,
        int last_n_tokens_size,
        int top_k,
        float top_p,
        float temp,
        float repeat_penalty) {
    const int64_t t_start_sample_us = ggml_time_us();

    llama_token result = 0;

    // TODO: avoid this ...
    const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);

    result = llama_sample_top_p_top_k(
            *ctx,
            last_n_tokens,
            top_k,
            top_p,
            temp,
            repeat_penalty);

    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    ctx->n_sample++;

    return result;
}
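
// Example (sketch): one generation step built on the sampler above. The
// hyper-parameters mirror common llama.cpp defaults of this era (an assumption,
// not mandated by this file); last_n is the recent-token window that feeds
// repeat_penalty.
static llama_token example_sample_step(struct llama_context * ctx, std::vector<llama_token> & last_n) {
    llama_token id = llama_sample_top_p_top_k(
            ctx,
            last_n.data(), (int) last_n.size(),
            /*top_k=*/40, /*top_p=*/0.95f, /*temp=*/0.8f, /*repeat_penalty=*/1.1f);

    // slide the history window
    if (!last_n.empty()) {
        last_n.erase(last_n.begin());
    }
    last_n.push_back(id);

    return id; // compare against llama_token_eos() to stop generation
}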

void llama_print_timings(struct llama_context * ctx) {
    const int64_t t_end_us = ggml_time_us();

    const int32_t n_sample = Max(1, ctx->n_sample);
    const int32_t n_eval   = Max(1, ctx->n_eval);
    const int32_t n_p_eval = Max(1, ctx->n_p_eval);

    fprintf(stderr, "\n");
    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
    fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
}

void llama_reset_timings(struct llama_context * ctx) {
    ctx->t_start_us = ggml_time_us();
    ctx->t_sample_us = ctx->n_sample = 0;
    ctx->t_eval_us   = ctx->n_eval   = 0;
    ctx->t_p_eval_us = ctx->n_p_eval = 0;
}

const char * llama_print_system_info(void) {
    static std::string s;

    s  = "";
    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
    s += "FMA = "       + std::to_string(ggml_cpu_has_fma())       + " | ";
    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
    s += "ARM_FMA = "   + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
    s += "F16C = "      + std::to_string(ggml_cpu_has_f16c())      + " | ";
    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";

    return s.c_str();
}