llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +27 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +14 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +81 -20
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
- data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +141 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -12
- data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
- data/vendor/tmp/llama.cpp/llama.h +145 -29
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
data/vendor/tmp/llama.cpp/llama.cpp

@@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

-#ifdef
+#ifdef GGML_USE_CUDA
 # include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 # include "ggml-opencl.h"
@@ -52,12 +52,16 @@
 #define NOMINMAX
 #endif
 #include <windows.h>
+#ifndef PATH_MAX
+#define PATH_MAX MAX_PATH
+#endif
 #include <io.h>
 #endif

 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cctype>
 #include <cfloat>
 #include <cinttypes>
 #include <climits>
@@ -68,7 +72,6 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
@@ -192,6 +195,7 @@ enum llm_arch {
 LLM_ARCH_LLAMA,
 LLM_ARCH_FALCON,
 LLM_ARCH_BAICHUAN,
+LLM_ARCH_GROK,
 LLM_ARCH_GPT2,
 LLM_ARCH_GPTJ,
 LLM_ARCH_GPTNEOX,
@@ -214,6 +218,7 @@ enum llm_arch {
 LLM_ARCH_GEMMA,
 LLM_ARCH_STARCODER2,
 LLM_ARCH_MAMBA,
+LLM_ARCH_XVERSE,
 LLM_ARCH_COMMAND_R,
 LLM_ARCH_UNKNOWN,
 };
@@ -221,6 +226,7 @@ enum llm_arch {
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_LLAMA, "llama" },
 { LLM_ARCH_FALCON, "falcon" },
+{ LLM_ARCH_GROK, "grok" },
 { LLM_ARCH_GPT2, "gpt2" },
 { LLM_ARCH_GPTJ, "gptj" },
 { LLM_ARCH_GPTNEOX, "gptneox" },
@@ -244,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_GEMMA, "gemma" },
 { LLM_ARCH_STARCODER2, "starcoder2" },
 { LLM_ARCH_MAMBA, "mamba" },
+{ LLM_ARCH_XVERSE, "xverse" },
 { LLM_ARCH_COMMAND_R, "command-r" },
 { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
@@ -254,6 +261,7 @@ enum llm_kv {
 LLM_KV_GENERAL_ALIGNMENT,
 LLM_KV_GENERAL_NAME,
 LLM_KV_GENERAL_AUTHOR,
+LLM_KV_GENERAL_VERSION,
 LLM_KV_GENERAL_URL,
 LLM_KV_GENERAL_DESCRIPTION,
 LLM_KV_GENERAL_LICENSE,
@@ -290,6 +298,10 @@ enum llm_kv {
 LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
 LLM_KV_ROPE_SCALING_FINETUNED,

+LLM_KV_SPLIT_NO,
+LLM_KV_SPLIT_COUNT,
+LLM_KV_SPLIT_TENSORS_COUNT,
+
 LLM_KV_SSM_INNER_SIZE,
 LLM_KV_SSM_CONV_KERNEL,
 LLM_KV_SSM_STATE_SIZE,
@@ -306,6 +318,8 @@ enum llm_kv {
 LLM_KV_TOKENIZER_UNK_ID,
 LLM_KV_TOKENIZER_SEP_ID,
 LLM_KV_TOKENIZER_PAD_ID,
+LLM_KV_TOKENIZER_CLS_ID,
+LLM_KV_TOKENIZER_MASK_ID,
 LLM_KV_TOKENIZER_ADD_BOS,
 LLM_KV_TOKENIZER_ADD_EOS,
 LLM_KV_TOKENIZER_ADD_PREFIX,
@@ -319,6 +333,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
 { LLM_KV_GENERAL_NAME, "general.name" },
 { LLM_KV_GENERAL_AUTHOR, "general.author" },
+{ LLM_KV_GENERAL_VERSION, "general.version" },
 { LLM_KV_GENERAL_URL, "general.url" },
 { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
 { LLM_KV_GENERAL_LICENSE, "general.license" },
@@ -355,6 +370,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
 { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

+{ LLM_KV_SPLIT_NO, "split.no" },
+{ LLM_KV_SPLIT_COUNT, "split.count" },
+{ LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
+
 { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
 { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
 { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
@@ -371,6 +390,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
 { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
 { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
 { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
 { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
 { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
@@ -411,9 +432,12 @@ enum llm_tensor {
 LLM_TENSOR_FFN_DOWN,
 LLM_TENSOR_FFN_UP,
 LLM_TENSOR_FFN_ACT,
-LLM_TENSOR_FFN_DOWN_EXP,
+LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
 LLM_TENSOR_FFN_GATE_EXP,
 LLM_TENSOR_FFN_UP_EXP,
+LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
+LLM_TENSOR_FFN_GATE_EXPS,
+LLM_TENSOR_FFN_UP_EXPS,
 LLM_TENSOR_ATTN_Q_NORM,
 LLM_TENSOR_ATTN_K_NORM,
 LLM_TENSOR_LAYER_OUT_NORM,
@@ -448,6 +472,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
 { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
 { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
 },
 },
 {
@@ -483,6 +510,31 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_GROK,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+},
+},
 {
 LLM_ARCH_GPT2,
 {
@@ -548,6 +600,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
+{ LLM_TENSOR_POS_EMBD, "position_embd" },
+{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
 },
 },
 {
@@ -843,6 +898,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
 },
 },
+{
+LLM_ARCH_XVERSE,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
 {
 LLM_ARCH_COMMAND_R,
 {
@@ -856,6 +930,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
 { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
 },
 },
 {
@@ -1030,7 +1106,7 @@ struct llama_file {
 size_t size;

 llama_file(const char * fname, const char * mode) {
-fp =
+fp = ggml_fopen(fname, mode);
 if (fp == NULL) {
 throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
 }
@@ -1099,6 +1175,7 @@ struct llama_file {
 }
 }
 };
+using llama_files = std::vector<std::unique_ptr<llama_file>>;

 struct llama_mmap {
 void * addr;
@@ -1299,6 +1376,7 @@ struct llama_mmap {
 }
 #endif
 };
+using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;

 // Represents some region of memory being locked using mlock or VirtualLock;
 // will automatically unlock on destruction.
@@ -1448,6 +1526,7 @@ struct llama_mlock {
 static void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
+using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

 static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
 std::vector<char> result(8, 0);
@@ -1467,7 +1546,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
 ggml_backend_buffer_type_t buft = nullptr;

-#if defined(
+#if defined(GGML_USE_CUDA)
 // host buffers should only be used when data is expected to be copied to/from the GPU
 if (host_buffer) {
 buft = ggml_backend_cuda_host_buffer_type();
@@ -1497,7 +1576,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {

 #ifdef GGML_USE_METAL
 buft = ggml_backend_metal_buffer_type();
-#elif defined(
+#elif defined(GGML_USE_CUDA)
 buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
 buft = ggml_backend_vk_buffer_type(gpu);
@@ -1523,7 +1602,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
 ggml_backend_buffer_type_t buft = nullptr;

-#ifdef
+#ifdef GGML_USE_CUDA
 if (ggml_backend_cuda_get_device_count() > 1) {
 buft = ggml_backend_cuda_split_buffer_type(tensor_split);
 }
@@ -1544,7 +1623,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 }

 static size_t llama_get_device_count() {
-#if defined(
+#if defined(GGML_USE_CUDA)
 return ggml_backend_cuda_get_device_count();
 #elif defined(GGML_USE_SYCL)
 return ggml_backend_sycl_get_device_count();
@@ -1556,20 +1635,20 @@
 }

 static size_t llama_get_device_memory(int device) {
-#if defined(
+#if defined(GGML_USE_CUDA)
 size_t total;
 size_t free;
-ggml_backend_cuda_get_device_memory(device, &
+ggml_backend_cuda_get_device_memory(device, &free, &total);
 return free;
 #elif defined(GGML_USE_SYCL)
 size_t total;
 size_t free;
-ggml_backend_sycl_get_device_memory(device, &
+ggml_backend_sycl_get_device_memory(device, &free, &total);
 return free;
 #elif defined(GGML_USE_VULKAN)
 size_t total;
 size_t free;
-ggml_backend_vk_get_device_memory(device, &
+ggml_backend_vk_get_device_memory(device, &free, &total);
 return free;
 #else
 return 1;
@@ -1621,10 +1700,13 @@ enum e_model {
 MODEL_40B,
 MODEL_65B,
 MODEL_70B,
+MODEL_314B,
 MODEL_SMALL,
 MODEL_MEDIUM,
 MODEL_LARGE,
 MODEL_XL,
+MODEL_8x7B,
+MODEL_8x22B,
 };

 static const size_t kiB = 1024;
@@ -1738,6 +1820,7 @@ struct llama_cparams {
 uint32_t n_ctx; // context size used during inference
 uint32_t n_batch;
 uint32_t n_ubatch;
+uint32_t n_seq_max;
 uint32_t n_threads; // number of threads to use for generation
 uint32_t n_threads_batch; // number of threads to use for batch processing

@@ -1803,9 +1886,9 @@ struct llama_layer {

 // ff MoE
 struct ggml_tensor * ffn_gate_inp;
-struct ggml_tensor *
-struct ggml_tensor *
-struct ggml_tensor *
+struct ggml_tensor * ffn_gate_exps;
+struct ggml_tensor * ffn_down_exps;
+struct ggml_tensor * ffn_up_exps ;

 // ff bias
 struct ggml_tensor * ffn_down_b; // b2
@@ -1941,11 +2024,13 @@ struct llama_vocab {
 std::map<std::pair<std::string, std::string>, int> bpe_ranks;

 // default LLaMA special tokens
-id special_bos_id
-id special_eos_id
-id special_unk_id
-id special_sep_id
-id special_pad_id
+id special_bos_id = 1;
+id special_eos_id = 2;
+id special_unk_id = 0;
+id special_sep_id = -1;
+id special_pad_id = -1;
+id special_cls_id = -1;
+id special_mask_id = -1;

 int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
 int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
@@ -2023,12 +2108,12 @@ struct llama_model {
 // the model memory buffers for the tensor data
 std::vector<ggml_backend_buffer_t> bufs;

-// model memory mapped
-
+// model memory mapped files
+llama_mmaps mappings;

 // objects representing data potentially being locked in memory
-
-
+llama_mlocks mlock_bufs;
+llama_mlocks mlock_mmaps;

 // for quantize-stats only
 std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -2041,7 +2126,7 @@ struct llama_model {
 ggml_free(ctx);
 }
 for (ggml_backend_buffer_t buf : bufs) {
-#ifdef
+#ifdef GGML_USE_CUDA
 if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
 ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
 }
@@ -2060,10 +2145,6 @@ struct llama_context {
 ggml_backend_free(backend);
 }

-#ifdef GGML_USE_VULKAN
-ggml_vk_free_cpu_assist();
-#endif
-
 ggml_backend_buffer_free(buf_output);
 }

@@ -2100,20 +2181,20 @@ struct llama_context {
 // host buffer for the model output (logits and embeddings)
 ggml_backend_buffer_t buf_output = nullptr;

-// decode output (2-dimensional array: [
-size_t
-float * logits
+// decode output (2-dimensional array: [n_outputs][n_vocab])
+size_t logits_size = 0; // capacity (of floats) for logits
+float * logits = nullptr;
+
+std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
+size_t output_size = 0; // capacity (of tokens positions) for the output buffers
+int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch

-#ifndef NDEBUG
-// guard against access to unset logits
-std::vector<bool> logits_valid;
-#endif
 bool logits_all = false;

-// embeddings output (2-dimensional array: [
+// embeddings output (2-dimensional array: [n_outputs][n_embd])
 // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
-size_t
-float * embd
+size_t embd_size = 0; // capacity (of floats) for embeddings
+float * embd = nullptr;

 // sequence embeddings output (map of [n_embd] vectors)
 // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
@@ -2130,14 +2211,15 @@ struct llama_context {
 struct ggml_tensor * inp_tokens; // I32 [n_batch]
 struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
 struct ggml_tensor * inp_pos; // I32 [n_batch]
+struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
 struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
-struct ggml_tensor * inp_KQ_pos; // F32 [
+struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
 struct ggml_tensor * inp_K_shift; // I32 [kv_size]
 struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
 struct ggml_tensor * inp_cls; // I32 [n_batch]
 struct ggml_tensor * inp_s_copy; // I32 [kv_size]
-struct ggml_tensor * inp_s_mask; // F32 [1,
-struct ggml_tensor * inp_s_seq; // I32 [
+struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
+struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]

 // control vectors
 struct llama_control_vector cvec;
|
|
2792
2874
|
};
|
2793
2875
|
}
|
2794
2876
|
|
2877
|
+
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
|
2878
|
+
|
2795
2879
|
struct llama_model_loader {
|
2796
2880
|
int n_kv = 0;
|
2797
2881
|
int n_tensors = 0;
|
@@ -2802,54 +2886,133 @@ struct llama_model_loader {
|
|
2802
2886
|
|
2803
2887
|
bool use_mmap = false;
|
2804
2888
|
|
2805
|
-
|
2889
|
+
llama_files files;
|
2806
2890
|
llama_ftype ftype;
|
2807
2891
|
llama_fver fver;
|
2808
2892
|
|
2809
|
-
|
2893
|
+
llama_mmaps mappings;
|
2894
|
+
|
2895
|
+
// Holds information on a model weight
|
2896
|
+
struct llama_tensor_weight {
|
2897
|
+
uint16_t idx; // source file index
|
2898
|
+
size_t offs; // tensor data offset in the original file
|
2899
|
+
|
2900
|
+
ggml_tensor * tensor;
|
2901
|
+
|
2902
|
+
llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
|
2903
|
+
const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
|
2904
|
+
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
|
2905
|
+
}
|
2906
|
+
};
|
2907
|
+
std::vector<llama_tensor_weight> weights;
|
2908
|
+
|
2810
2909
|
std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
|
2811
2910
|
|
2812
|
-
struct gguf_context *
|
2813
|
-
|
2911
|
+
struct gguf_context * meta = NULL;
|
2912
|
+
std::vector<ggml_context *> contexts;
|
2814
2913
|
|
2815
2914
|
std::string arch_name;
|
2816
2915
|
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
2817
2916
|
|
2818
|
-
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p)
|
2917
|
+
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
|
2819
2918
|
int trace = 0;
|
2820
2919
|
if (getenv("LLAMA_TRACE")) {
|
2821
2920
|
trace = atoi(getenv("LLAMA_TRACE"));
|
2822
2921
|
}
|
2823
2922
|
|
2824
|
-
struct gguf_init_params params = {
|
2825
|
-
/*.no_alloc = */ true,
|
2826
|
-
/*.ctx = */ &ctx_meta,
|
2827
|
-
};
|
2828
|
-
|
2829
2923
|
if (param_overrides_p != nullptr) {
|
2830
2924
|
for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
|
2831
2925
|
kv_overrides.insert({std::string(p->key), *p});
|
2832
2926
|
}
|
2833
2927
|
}
|
2834
2928
|
|
2835
|
-
|
2836
|
-
|
2929
|
+
struct ggml_context * ctx = NULL;
|
2930
|
+
struct gguf_init_params params = {
|
2931
|
+
/*.no_alloc = */ true,
|
2932
|
+
/*.ctx = */ &ctx,
|
2933
|
+
};
|
2934
|
+
|
2935
|
+
meta = gguf_init_from_file(fname.c_str(), params);
|
2936
|
+
if (!meta) {
|
2837
2937
|
throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
|
2838
2938
|
}
|
2839
2939
|
|
2840
2940
|
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
2841
2941
|
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
2842
2942
|
|
2843
|
-
|
2844
|
-
|
2943
|
+
// Save tensors data offset of the main file.
|
2944
|
+
// For subsidiary files, `meta` tensor data offset must not be used,
|
2945
|
+
// so we build a unified tensors index for weights.
|
2946
|
+
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
2947
|
+
weights.emplace_back(0, cur->name, meta, cur);
|
2948
|
+
}
|
2949
|
+
files.emplace_back(new llama_file(fname.c_str(), "rb"));
|
2950
|
+
contexts.emplace_back(ctx);
|
2951
|
+
|
2952
|
+
uint16_t n_split = 0;
|
2953
|
+
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
|
2954
|
+
|
2955
|
+
// Load additional GGML contexts
|
2956
|
+
if (n_split > 1) {
|
2957
|
+
uint16_t idx = 0;
|
2958
|
+
get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
|
2959
|
+
if (idx != 0) {
|
2960
|
+
throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
|
2961
|
+
}
|
2962
|
+
|
2963
|
+
char split_prefix[PATH_MAX] = {0};
|
2964
|
+
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
|
2965
|
+
throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
|
2966
|
+
}
|
2967
|
+
|
2968
|
+
if (trace > 0) {
|
2969
|
+
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
|
2970
|
+
}
|
2971
|
+
|
2972
|
+
char split_path[PATH_MAX] = {0};
|
2973
|
+
for (idx = 1; idx < n_split; idx++) {
|
2974
|
+
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
|
2975
|
+
|
2976
|
+
struct gguf_init_params split_params = {
|
2977
|
+
/*.no_alloc = */ true,
|
2978
|
+
/*.ctx = */ &ctx,
|
2979
|
+
};
|
2980
|
+
struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
|
2981
|
+
if (!ctx_gguf) {
|
2982
|
+
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
|
2983
|
+
}
|
2984
|
+
|
2985
|
+
// Save tensors data offset info of the shard.
|
2986
|
+
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
2987
|
+
weights.emplace_back(idx, cur->name, ctx_gguf, cur);
|
2988
|
+
}
|
2989
|
+
files.emplace_back(new llama_file(split_path, "rb"));
|
2990
|
+
contexts.emplace_back(ctx);
|
2991
|
+
|
2992
|
+
gguf_free(ctx_gguf);
|
2993
|
+
}
|
2994
|
+
|
2995
|
+
get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
|
2845
2996
|
|
2846
|
-
|
2997
|
+
// sanity check
|
2998
|
+
{
|
2999
|
+
const int n_tensors_loaded = (int) weights.size();
|
3000
|
+
if (n_tensors != n_tensors_loaded) {
|
3001
|
+
throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
|
3002
|
+
}
|
3003
|
+
}
|
3004
|
+
|
3005
|
+
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
|
3006
|
+
}
|
3007
|
+
|
3008
|
+
n_kv = gguf_get_n_kv(meta);
|
3009
|
+
n_tensors = weights.size();
|
2847
3010
|
|
2848
|
-
|
2849
|
-
|
2850
|
-
|
2851
|
-
n_elements += ggml_nelements(
|
2852
|
-
n_bytes += ggml_nbytes(
|
3011
|
+
fver = (enum llama_fver) gguf_get_version(meta);
|
3012
|
+
|
3013
|
+
for (auto & w : weights) {
|
3014
|
+
n_elements += ggml_nelements(w.tensor);
|
3015
|
+
n_bytes += ggml_nbytes(w.tensor);
|
2853
3016
|
}
|
2854
3017
|
|
2855
3018
|
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
@@ -2864,7 +3027,8 @@ struct llama_model_loader {
|
|
2864
3027
|
enum ggml_type type_max = GGML_TYPE_F32;
|
2865
3028
|
|
2866
3029
|
for (int i = 0; i < n_tensors; i++) {
|
2867
|
-
|
3030
|
+
const ggml_tensor * tensor = weights.at(i).tensor;
|
3031
|
+
enum ggml_type type = tensor->type;
|
2868
3032
|
|
2869
3033
|
n_type[type]++;
|
2870
3034
|
|
@@ -2874,8 +3038,8 @@ struct llama_model_loader {
|
|
2874
3038
|
}
|
2875
3039
|
|
2876
3040
|
if (trace > 0) {
|
2877
|
-
|
2878
|
-
LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(
|
3041
|
+
const uint16_t sid = weights.at(i).idx;
|
3042
|
+
LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
|
2879
3043
|
}
|
2880
3044
|
}
|
2881
3045
|
|
@@ -2897,6 +3061,7 @@ struct llama_model_loader {
|
|
2897
3061
|
case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
|
2898
3062
|
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
2899
3063
|
case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
|
3064
|
+
case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
|
2900
3065
|
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
2901
3066
|
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
2902
3067
|
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
@@ -2911,22 +3076,23 @@ struct llama_model_loader {
|
|
2911
3076
|
ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
|
2912
3077
|
|
2913
3078
|
{
|
2914
|
-
const int kid = gguf_find_key(
|
3079
|
+
const int kid = gguf_find_key(meta, "general.file_type");
|
2915
3080
|
if (kid >= 0) {
|
2916
|
-
ftype = (llama_ftype) gguf_get_val_u32(
|
3081
|
+
ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
|
2917
3082
|
}
|
2918
3083
|
}
|
2919
3084
|
|
2920
3085
|
LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
3086
|
+
|
2921
3087
|
for (int i = 0; i < n_kv; i++) {
|
2922
|
-
const char * name = gguf_get_key(
|
2923
|
-
const enum gguf_type type = gguf_get_kv_type(
|
3088
|
+
const char * name = gguf_get_key(meta, i);
|
3089
|
+
const enum gguf_type type = gguf_get_kv_type(meta, i);
|
2924
3090
|
const std::string type_name =
|
2925
3091
|
type == GGUF_TYPE_ARRAY
|
2926
|
-
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(
|
3092
|
+
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
|
2927
3093
|
: gguf_type_name(type);
|
2928
3094
|
|
2929
|
-
std::string value = gguf_kv_to_str(
|
3095
|
+
std::string value = gguf_kv_to_str(meta, i);
|
2930
3096
|
const size_t MAX_VALUE_LEN = 40;
|
2931
3097
|
if (value.size() > MAX_VALUE_LEN) {
|
2932
3098
|
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
|
@@ -2955,18 +3121,18 @@ struct llama_model_loader {
|
|
2955
3121
|
}
|
2956
3122
|
|
2957
3123
|
~llama_model_loader() {
|
2958
|
-
if (
|
2959
|
-
gguf_free(
|
3124
|
+
if (meta) {
|
3125
|
+
gguf_free(meta);
|
2960
3126
|
}
|
2961
|
-
|
2962
|
-
ggml_free(
|
3127
|
+
for (auto * ctx : contexts) {
|
3128
|
+
ggml_free(ctx);
|
2963
3129
|
}
|
2964
3130
|
}
|
2965
3131
|
|
2966
3132
|
template<typename T>
|
2967
3133
|
typename std::enable_if<std::is_integral<T>::value, bool>::type
|
2968
3134
|
get_arr_n(const std::string & key, T & result, const bool required = true) {
|
2969
|
-
const int kid = gguf_find_key(
|
3135
|
+
const int kid = gguf_find_key(meta, key.c_str());
|
2970
3136
|
|
2971
3137
|
if (kid < 0) {
|
2972
3138
|
if (required) {
|
@@ -2976,7 +3142,7 @@ struct llama_model_loader {
|
|
2976
3142
|
}
|
2977
3143
|
|
2978
3144
|
struct GGUFMeta::ArrayInfo arr_info =
|
2979
|
-
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(
|
3145
|
+
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
|
2980
3146
|
|
2981
3147
|
|
2982
3148
|
result = arr_info.length;
|
@@ -2996,7 +3162,7 @@ struct llama_model_loader {
|
|
2996
3162
|
const struct llama_model_kv_override * override =
|
2997
3163
|
it != kv_overrides.end() ? &it->second : nullptr;
|
2998
3164
|
|
2999
|
-
const bool found = GGUFMeta::GKV<T>::set(
|
3165
|
+
const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
|
3000
3166
|
|
3001
3167
|
if (required && !found) {
|
3002
3168
|
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
|
@@ -3019,28 +3185,57 @@ struct llama_model_loader {
|
|
3019
3185
|
}
|
3020
3186
|
|
3021
3187
|
const char * get_tensor_name(int i) const {
|
3022
|
-
return
|
3188
|
+
return weights.at(i).tensor->name;
|
3189
|
+
}
|
3190
|
+
|
3191
|
+
const llama_tensor_weight * get_weight(const char * name) const {
|
3192
|
+
for (const auto & weight : weights) {
|
3193
|
+
if (strcmp(name, weight.tensor->name) == 0) {
|
3194
|
+
return &weight;
|
3195
|
+
}
|
3196
|
+
}
|
3197
|
+
return nullptr;
|
3198
|
+
}
|
3199
|
+
|
3200
|
+
const llama_tensor_weight & require_weight(const char * name) const {
|
3201
|
+
const llama_tensor_weight * weight = get_weight(name);
|
3202
|
+
if (!weight) {
|
3203
|
+
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
|
3204
|
+
}
|
3205
|
+
return *weight;
|
3023
3206
|
}
|
3024
3207
|
|
3025
3208
|
struct ggml_tensor * get_tensor_meta(const char * name) const {
|
3026
|
-
|
3209
|
+
const auto * weight = get_weight(name);
|
3210
|
+
if (!weight) {
|
3211
|
+
return nullptr;
|
3212
|
+
}
|
3213
|
+
return weight->tensor;
|
3214
|
+
}
|
3215
|
+
|
3216
|
+
struct ggml_tensor * require_tensor_meta(const char * name) const {
|
3217
|
+
struct ggml_tensor * tensor = get_tensor_meta(name);
|
3218
|
+
if (!tensor) {
|
3219
|
+
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
|
3220
|
+
}
|
3221
|
+
return tensor;
|
3027
3222
|
}
|
3028
3223
|
|
3029
3224
|
struct ggml_tensor * get_tensor_meta(int i) const {
|
3030
3225
|
return get_tensor_meta(get_tensor_name(i));
|
3031
3226
|
}
|
3032
3227
|
|
3033
|
-
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor *
|
3034
|
-
struct ggml_tensor * tensor = ggml_dup_tensor(ctx,
|
3035
|
-
ggml_set_name(tensor, ggml_get_name(
|
3228
|
+
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
|
3229
|
+
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
|
3230
|
+
ggml_set_name(tensor, ggml_get_name(cur));
|
3036
3231
|
|
3037
3232
|
n_created++;
|
3038
3233
|
|
3039
3234
|
return tensor;
|
3040
3235
|
}
|
3041
3236
|
|
3042
|
-
struct ggml_tensor *
|
3043
|
-
struct ggml_tensor * cur =
|
3237
|
+
const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
|
3238
|
+
const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
|
3044
3239
|
|
3045
3240
|
if (cur == NULL) {
|
3046
3241
|
if (!required) {
|
@@ -3051,8 +3246,8 @@ struct llama_model_loader {
|
|
3051
3246
|
|
3052
3247
|
{
|
3053
3248
|
bool is_ok = true;
|
3054
|
-
for (size_t i = 0; i <
|
3055
|
-
if (ne[i] != cur->ne[i]) {
|
3249
|
+
for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
|
3250
|
+
if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
|
3056
3251
|
is_ok = false;
|
3057
3252
|
break;
|
3058
3253
|
}
|
@@ -3066,127 +3261,196 @@
 }
 }

-return
+return cur;
 }

-
-
-
+struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
+const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+
+if (cur == NULL) {
+return NULL;
 }
+
+return create_tensor_for(ctx, cur);
 }

-
-const
+struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
+const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);

-if (
-
+if (cur == NULL) {
+return NULL;
 }

-
-
+if (cur->type != base->type) {
+throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
+}

-
-
-
-mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+std::array<int64_t, GGML_MAX_DIMS> dims;
+for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+dims[i] = i < ne.size() ? ne[i] : 1;
 }

-
-
-
-
+struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
+dims[0], dims[1], dims[2], dims[3],
+cur->nb[1], cur->nb[2], cur->nb[3],
+offset);
+
+ggml_set_name(tensor, name.c_str());
+
+n_created++;
+
+return tensor;
+}
+
+void done_getting_tensors() const {
+if (n_created != n_tensors) {
+throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
 }
+}

-
-
-
+void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
+if (use_mmap) {
+mappings.reserve(files.size());
+mmaps_used.reserve(files.size());
+for (const auto & file : files) {
+std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+mmaps_used.emplace_back(mapping->size, 0);
+if (mlock_mmaps) {
+std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
+mlock_mmap->init(mapping->addr);
+mlock_mmaps->emplace_back(std::move(mlock_mmap));
+}
+mappings.emplace_back(std::move(mapping));
 }
-
+}
+
+// compute the total size of all tensors for progress reporting
+for (auto & w : weights) {
+size_data += ggml_nbytes(w.tensor);
 }
 }

-void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
-GGML_ASSERT(
+void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+GGML_ASSERT(!mappings.empty());
+const auto & mapping = mappings.at(idx);

 *first = mapping->size;
 *last = 0;
+*addr = mapping->addr;
 for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-
-
-
+try {
+const auto * weight = get_weight(ggml_get_name(tensor));
+if (!weight) {
+continue;
+}
+if (weight->idx != idx) {
+continue;
+}
+*first = std::min(*first, weight->offs);
+*last = std::max(*last, weight->offs + ggml_nbytes(tensor));
+} catch(...) {
+// the tensor is not in the model
+}
 }
 }

 // for backwards compatibility, does not support ggml-backend
 void load_data_for(struct ggml_tensor * cur) const {
-const
+const auto & w = require_weight(ggml_get_name(cur));

-if (use_mmap
+if (use_mmap) {
+const auto & mapping = mappings.at(w.idx);
 if (cur->data == nullptr) {
-cur->data = (uint8_t *)mapping->addr + offs;
+cur->data = (uint8_t *)mapping->addr + w.offs;
 } else {
-memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
+memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
 }
 } else {
 GGML_ASSERT(cur->data != nullptr);
-
-file.
+GGML_ASSERT(w.idx < files.size());
+const auto & file = files.at(w.idx);
+file->seek(w.offs, SEEK_SET);
+file->read_raw(cur->data, ggml_nbytes(cur));
 }
 }

 size_t size_done = 0;
 size_t size_data = 0;
-size_t
-size_t mmap_used_last = 0;
+std::vector<std::pair<size_t, size_t>> mmaps_used;

 // Returns false if cancelled by progress_callback
-bool load_all_data(
-
+bool load_all_data(
+struct ggml_context * ctx,
+llama_buf_map & bufs_mmap,
+llama_mlocks * lmlocks,
+llama_progress_callback progress_callback,
+void * progress_callback_user_data) {
+GGML_ASSERT(size_data != 0 && "call init_mappings() first");

 std::vector<no_init<uint8_t>> read_buf;
-
 for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+const auto * weight = get_weight(ggml_get_name(cur));
+if (weight == nullptr) {
+// this can happen with split experts models
+continue;
+}
+
 if (progress_callback) {
 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
 return false;
 }
 }

-
+size_t n_size = ggml_nbytes(cur);

-if (use_mmap
+if (use_mmap) {
+const auto & mapping = mappings.at(weight->idx);
+ggml_backend_buffer_t buf_mmap = nullptr;
+if (bufs_mmap.count(weight->idx)) {
+buf_mmap = bufs_mmap.at(weight->idx);
+}
+GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
 if (buf_mmap && cur->data == nullptr) {
-ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
-if (
-lmlock
+ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
+if (lmlocks) {
+const auto & lmlock = lmlocks->at(weight->idx);
+lmlock->grow_to(weight->offs + ggml_nbytes(cur));
 }
-
-
+
+auto & mmap_used = mmaps_used[weight->idx];
+mmap_used.first = std::min(mmap_used.first, weight->offs);
+mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
 } else {
-ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0,
+ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
 }
 } else {
+GGML_ASSERT(weight->idx < files.size());
+const auto & file = files.at(weight->idx);
 if (ggml_backend_buffer_is_host(cur->buffer)) {
-file
-file
+file->seek(weight->offs, SEEK_SET);
+file->read_raw(cur->data, ggml_nbytes(cur));
 } else {
 read_buf.resize(ggml_nbytes(cur));
-file
-file
-ggml_backend_tensor_set(cur, read_buf.data(), 0,
+file->seek(weight->offs, SEEK_SET);
+file->read_raw(read_buf.data(), ggml_nbytes(cur));
+ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
 }
 }

-size_done +=
+size_done += n_size;
 }

 // check if this is the last call and do final cleanup
 if (size_done >= size_data) {
 // unmap offloaded tensors and metadata
-if (use_mmap
-
-
-mapping
+if (use_mmap) {
+for (uint32_t idx = 0; idx < mappings.size(); idx++) {
+const auto & mmap_used = mmaps_used.at(idx);
+auto & mapping = mappings.at(idx);
+mapping->unmap_fragment(0, mmap_used.first);
+if (mmap_used.second != 0) {
+mapping->unmap_fragment(mmap_used.second, mapping->size);
+}
 }
 }
 if (progress_callback) {
@@ -3259,6 +3523,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
+case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
@@ -3290,10 +3555,13 @@ static const char * llama_model_type_name(e_model type) {
 case MODEL_40B: return "40B";
 case MODEL_65B: return "65B";
 case MODEL_70B: return "70B";
+case MODEL_314B: return "314B";
 case MODEL_SMALL: return "0.1B";
 case MODEL_MEDIUM: return "0.4B";
 case MODEL_LARGE: return "0.8B";
 case MODEL_XL: return "1.5B";
+case MODEL_8x7B: return "8x7B";
+case MODEL_8x22B: return "8x22B";
 default: return "?B";
 }
 }
@@ -3319,7 +3587,7 @@ static void llm_load_hparams(
 llama_model_loader & ml,
 llama_model & model) {
 auto & hparams = model.hparams;
-const gguf_context * ctx = ml.
+const gguf_context * ctx = ml.meta;

 // get metadata as string
 for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3408,15 +3676,23 @@ static void llm_load_hparams(
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-
-
-
-
-
-
-
-
-
+if (hparams.n_expert == 8) {
+switch (hparams.n_layer) {
+case 32: model.type = e_model::MODEL_8x7B; break;
+case 56: model.type = e_model::MODEL_8x22B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} else {
+switch (hparams.n_layer) {
+case 22: model.type = e_model::MODEL_1B; break;
+case 26: model.type = e_model::MODEL_3B; break;
+case 32: model.type = e_model::MODEL_7B; break;
+case 40: model.type = e_model::MODEL_13B; break;
+case 48: model.type = e_model::MODEL_34B; break;
+case 60: model.type = e_model::MODEL_30B; break;
+case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
 }
 } break;
 case LLM_ARCH_MINICPM:
@@ -3428,6 +3704,15 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_GROK:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+switch (hparams.n_layer) {
+case 64: model.type = e_model::MODEL_314B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 case LLM_ARCH_FALCON:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3679,6 +3964,16 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_XVERSE:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+switch (hparams.n_layer) {
+case 32: model.type = e_model::MODEL_7B; break;
+case 40: model.type = e_model::MODEL_13B; break;
+case 80: model.type = e_model::MODEL_65B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 case LLM_ARCH_COMMAND_R:
 {
 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -3701,7 +3996,9 @@ static void llm_load_hparams(
 }

 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(
+static std::vector<llama_vocab::id> llama_tokenize_internal(
+const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
+);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

 static void llm_load_vocab(
@@ -3709,7 +4006,7 @@ static void llm_load_vocab(
 llama_model & model) {
 auto & vocab = model.vocab;

-struct gguf_context * ctx = ml.
+struct gguf_context * ctx = ml.meta;

 const auto kv = LLM_KV(model.arch);

@@ -3723,23 +4020,27 @@ static void llm_load_vocab(
 vocab.type = LLAMA_VOCAB_TYPE_NONE;

 // default special tokens
-vocab.special_bos_id
-vocab.special_eos_id
-vocab.special_unk_id
-vocab.special_sep_id
-vocab.special_pad_id
-vocab.
+vocab.special_bos_id = -1;
+vocab.special_eos_id = -1;
+vocab.special_unk_id = -1;
+vocab.special_sep_id = -1;
+vocab.special_pad_id = -1;
+vocab.special_cls_id = -1;
+vocab.special_mask_id = -1;
+vocab.linefeed_id = -1;

 return;
 } else if (tokenizer_name == "llama") {
 vocab.type = LLAMA_VOCAB_TYPE_SPM;

 // default special tokens
-vocab.special_bos_id
-vocab.special_eos_id
-vocab.special_unk_id
-vocab.special_sep_id
-vocab.special_pad_id
+vocab.special_bos_id = 1;
+vocab.special_eos_id = 2;
+vocab.special_unk_id = 0;
+vocab.special_sep_id = -1;
+vocab.special_pad_id = -1;
+vocab.special_cls_id = -1;
+vocab.special_mask_id = -1;

 const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
 if (add_space_prefix_keyidx != -1) {
@@ -3774,20 +4075,24 @@ static void llm_load_vocab(
 }

 // default special tokens
-vocab.special_bos_id
-vocab.special_eos_id
-vocab.special_unk_id
-vocab.special_sep_id
-vocab.special_pad_id
+vocab.special_bos_id = 11;
+vocab.special_eos_id = 11;
+vocab.special_unk_id = -1;
+vocab.special_sep_id = -1;
+vocab.special_pad_id = -1;
+vocab.special_cls_id = -1;
+vocab.special_mask_id = -1;
 } else if (tokenizer_name == "bert") {
 vocab.type = LLAMA_VOCAB_TYPE_WPM;

 // default special tokens
-vocab.special_bos_id
-vocab.special_eos_id
-vocab.special_unk_id
-vocab.special_sep_id
-vocab.special_pad_id
+vocab.special_bos_id = -1;
+vocab.special_eos_id = -1;
+vocab.special_unk_id = 100;
+vocab.special_sep_id = 102;
+vocab.special_pad_id = 0;
+vocab.special_cls_id = 101;
+vocab.special_mask_id = 103;
 vocab.add_space_prefix = false;
 } else {
 LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
@@ -3842,7 +4147,7 @@ static void llm_load_vocab(
 } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
 vocab.linefeed_id = vocab.special_pad_id;
 } else {
-const std::vector<int> ids = llama_tokenize_internal(vocab, "\
+const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
 GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
 vocab.linefeed_id = ids[0];
 }
@@ -3850,11 +4155,13 @@ static void llm_load_vocab(
 // special tokens
 {
 const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-{ LLM_KV_TOKENIZER_BOS_ID,
-{ LLM_KV_TOKENIZER_EOS_ID,
-{ LLM_KV_TOKENIZER_UNK_ID,
-{ LLM_KV_TOKENIZER_SEP_ID,
-{ LLM_KV_TOKENIZER_PAD_ID,
+{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+{ LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+{ LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
 };
 for (const auto & it : special_token_types) {
 const std::string & key = kv(std::get<0>(it));
@@ -4046,12 +4353,14 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
4046
4353
|
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
|
4047
4354
|
|
4048
4355
|
// special tokens
|
4049
|
-
if (vocab.special_bos_id
|
4050
|
-
if (vocab.special_eos_id
|
4051
|
-
if (vocab.special_unk_id
|
4052
|
-
if (vocab.special_sep_id
|
4053
|
-
if (vocab.special_pad_id
|
4054
|
-
if (vocab.
|
4356
|
+
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
4357
|
+
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
4358
|
+
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
4359
|
+
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
4360
|
+
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
4361
|
+
if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
|
4362
|
+
if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
|
4363
|
+
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
4055
4364
|
}
|
4056
4365
|
|
4057
4366
|
// Returns false if cancelled by progress_callback
|
@@ -4075,6 +4384,7 @@ static bool llm_load_tensors(
|
|
4075
4384
|
|
4076
4385
|
const int64_t n_layer = hparams.n_layer;
|
4077
4386
|
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
|
4387
|
+
bool use_mmap_buffer = true;
|
4078
4388
|
|
4079
4389
|
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
4080
4390
|
model.buft_input = llama_default_buffer_type_cpu(true);
|
@@ -4163,6 +4473,10 @@ static bool llm_load_tensors(
|
|
4163
4473
|
|
4164
4474
|
// create one context per buffer type
|
4165
4475
|
size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
4476
|
+
|
4477
|
+
// for moe merged tensors
|
4478
|
+
ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
|
4479
|
+
|
4166
4480
|
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
4167
4481
|
for (auto & it : buft_layer_count) {
|
4168
4482
|
struct ggml_init_params params = {
|
@@ -4189,6 +4503,11 @@ static bool llm_load_tensors(
|
|
4189
4503
|
const int64_t n_vocab = hparams.n_vocab;
|
4190
4504
|
const int64_t n_vocab_type = hparams.n_vocab_type;
|
4191
4505
|
const int64_t n_ff = hparams.n_ff;
|
4506
|
+
const int64_t n_expert = hparams.n_expert;
|
4507
|
+
|
4508
|
+
if (n_expert > 0 && hparams.n_expert_used == 0) {
|
4509
|
+
throw std::runtime_error("model has expert layers but no expert layers are used");
|
4510
|
+
}
|
4192
4511
|
|
4193
4512
|
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
4194
4513
|
|
@@ -4243,26 +4562,113 @@ static bool llm_load_tensors(
|
|
4243
4562
|
|
4244
4563
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4245
4564
|
|
4246
|
-
|
4247
|
-
|
4248
|
-
if (layer.ffn_gate_inp == nullptr) {
|
4249
|
-
GGML_ASSERT(hparams.n_expert == 0);
|
4250
|
-
GGML_ASSERT(hparams.n_expert_used == 0);
|
4251
|
-
|
4565
|
+
if (n_expert == 0) {
|
4252
4566
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
4253
4567
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
4254
4568
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4255
4569
|
} else {
|
4256
|
-
|
4257
|
-
|
4258
|
-
|
4259
|
-
|
4260
|
-
|
4261
|
-
layer.
|
4262
|
-
|
4263
|
-
|
4570
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
4571
|
+
|
4572
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
4573
|
+
if (layer.ffn_gate_exps) {
|
4574
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
4575
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
4576
|
+
} else {
|
4577
|
+
// merge the split expert tensors into a single tensor for compatibility with older models
|
4578
|
+
// requires disabling mmap
|
4579
|
+
use_mmap_buffer = false;
|
4580
|
+
|
4581
|
+
ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
|
4582
|
+
ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
|
4583
|
+
ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
|
4584
|
+
|
4585
|
+
layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
|
4586
|
+
layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
|
4587
|
+
layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
|
4588
|
+
|
4589
|
+
ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
|
4590
|
+
ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
|
4591
|
+
ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
|
4592
|
+
|
4593
|
+
for (uint32_t x = 0; x < n_expert; ++x) {
|
4594
|
+
// the individual experts are loaded into a view of the merged tensor
|
4595
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
|
4596
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
|
4597
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
|
4598
|
+
}
|
4599
|
+
}
|
4600
|
+
}
|
4601
|
+
}
|
4602
|
+
} break;
|
4603
|
+
case LLM_ARCH_GROK:
|
4604
|
+
{
|
4605
|
+
if (n_expert == 0) {
|
4606
|
+
throw std::runtime_error("Grok model cannot have zero experts");
|
4607
|
+
}
|
4608
|
+
|
4609
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4610
|
+
|
4611
|
+
// output
|
4612
|
+
{
|
4613
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4614
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
4615
|
+
// if output is NULL, init from the input tok embed
|
4616
|
+
if (model.output == NULL) {
|
4617
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4618
|
+
ml.n_created--; // artificial tensor
|
4619
|
+
ml.size_data += ggml_nbytes(model.output);
|
4620
|
+
}
|
4621
|
+
}
|
4622
|
+
|
4623
|
+
for (int i = 0; i < n_layer; ++i) {
|
4624
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4625
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4626
|
+
|
4627
|
+
auto & layer = model.layers[i];
|
4628
|
+
|
4629
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4630
|
+
|
4631
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
4632
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
4633
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
4634
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4635
|
+
|
4636
|
+
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
|
4637
|
+
|
4638
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4639
|
+
|
4640
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
4641
|
+
|
4642
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
4643
|
+
if (layer.ffn_gate_exps) {
|
4644
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
4645
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
4646
|
+
} else {
|
4647
|
+
// merge the split expert tensors into a single tensor for compatibility with older models
|
4648
|
+
// requires disabling mmap
|
4649
|
+
use_mmap_buffer = false;
|
4650
|
+
|
4651
|
+
ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
|
4652
|
+
ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
|
4653
|
+
ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
|
4654
|
+
|
4655
|
+
layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
|
4656
|
+
layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
|
4657
|
+
layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
|
4658
|
+
|
4659
|
+
ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
|
4660
|
+
ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
|
4661
|
+
ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
|
4662
|
+
|
4663
|
+
for (uint32_t x = 0; x < n_expert; ++x) {
|
4664
|
+
// the individual experts are loaded into a view of the merged tensor
|
4665
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
|
4666
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
|
4667
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
|
4264
4668
|
}
|
4265
4669
|
}
|
4670
|
+
|
4671
|
+
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
4266
4672
|
}
|
4267
4673
|
} break;
|
4268
4674
|
case LLM_ARCH_BAICHUAN:
|
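Both the LLaMA and the new Grok branch above fall back to a merged layout when a GGUF still ships one 2-D tensor per expert: a single 3-D `*_exps` tensor is allocated, each legacy expert tensor is loaded through a view placed at byte offset `nb[2]*x`, and mmap is disabled because the data has to be gathered into that fresh allocation. A condensed sketch of the view arithmetic (the helper name is illustrative, not the loader's own):

```cpp
// Sketch: address one expert's 2-D slice inside a merged {n_embd, n_ff, n_expert} tensor.
#include "ggml.h"

static void make_expert_views(ggml_context * ctx, ggml_tensor * merged,
                              int64_t n_embd, int64_t n_ff, int n_expert) {
    for (int x = 0; x < n_expert; ++x) {
        // expert x begins nb[2]*x bytes into the merged tensor; rows keep stride nb[1]
        ggml_tensor * expert_x = ggml_view_2d(ctx, merged, n_embd, n_ff,
                                              merged->nb[1], merged->nb[2]*x);
        (void) expert_x; // the real loader reads the legacy per-expert data into this view
    }
}
```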
@@ -4319,10 +4725,8 @@ static bool llm_load_tensors(
|
|
4319
4725
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4320
4726
|
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
4321
4727
|
|
4322
|
-
|
4323
|
-
|
4324
|
-
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
|
4325
|
-
}
|
4728
|
+
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
|
4729
|
+
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
|
4326
4730
|
|
4327
4731
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
4328
4732
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
@@ -4502,6 +4906,7 @@ static bool llm_load_tensors(
|
|
4502
4906
|
case LLM_ARCH_MPT:
|
4503
4907
|
{
|
4504
4908
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4909
|
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
|
4505
4910
|
|
4506
4911
|
// output
|
4507
4912
|
{
|
@@ -4540,6 +4945,12 @@ static bool llm_load_tensors(
|
|
4540
4945
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4541
4946
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
|
4542
4947
|
|
4948
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
|
4949
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
|
4950
|
+
|
4951
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
|
4952
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
|
4953
|
+
|
4543
4954
|
// AWQ ScaleActivation layer
|
4544
4955
|
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
|
4545
4956
|
}
|
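The MPT additions above load `pos_embd` and the Q/K layernorm weights with a trailing `false`, marking them optional: checkpoints that do not ship those tensors simply get a null pointer, and the graph builder branches on it later. A tiny self-contained sketch of that convention (the map and `find_weight` helper are stand-ins, not the real model loader):

```cpp
// Sketch: "required = false" loading returns nullptr instead of throwing.
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

struct weight { int dummy; };

static weight * find_weight(std::map<std::string, weight> & w, const std::string & name, bool required) {
    auto it = w.find(name);
    if (it == w.end()) {
        if (required) { throw std::runtime_error("missing tensor: " + name); }
        return nullptr; // optional tensor: simply absent
    }
    return &it->second;
}

int main() {
    std::map<std::string, weight> w = { {"blk.0.attn_norm.weight", {0}} };
    weight * q_norm = find_weight(w, "blk.0.attn_q_norm.weight", /*required=*/false);
    std::printf(q_norm ? "Q-layernorm present\n" : "no Q-layernorm in this checkpoint, skipping\n");
    return 0;
}
```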
@@ -4986,6 +5397,28 @@ static bool llm_load_tensors(
|
|
4986
5397
|
layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
|
4987
5398
|
}
|
4988
5399
|
} break;
|
5400
|
+
case LLM_ARCH_XVERSE:
|
5401
|
+
{
|
5402
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5403
|
+
{
|
5404
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5405
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
5406
|
+
}
|
5407
|
+
for (int i = 0; i < n_layer; ++i) {
|
5408
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
5409
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5410
|
+
auto & layer = model.layers[i];
|
5411
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5412
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5413
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5414
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5415
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5416
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
5417
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5418
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
5419
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5420
|
+
}
|
5421
|
+
} break;
|
4989
5422
|
case LLM_ARCH_COMMAND_R:
|
4990
5423
|
{
|
4991
5424
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
@@ -5007,6 +5440,11 @@ static bool llm_load_tensors(
|
|
5007
5440
|
|
5008
5441
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5009
5442
|
|
5443
|
+
if (n_layer >= 64){
|
5444
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
|
5445
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
|
5446
|
+
}
|
5447
|
+
|
5010
5448
|
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5011
5449
|
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5012
5450
|
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
@@ -5024,56 +5462,97 @@ static bool llm_load_tensors(
|
|
5024
5462
|
|
5025
5463
|
ml.done_getting_tensors();
|
5026
5464
|
|
5027
|
-
ml.
|
5465
|
+
ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
|
5466
|
+
model.mappings.reserve(ml.mappings.size());
|
5028
5467
|
|
5029
5468
|
// create the backend buffers
|
5030
|
-
std::vector<std::pair<ggml_context *,
|
5469
|
+
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
|
5470
|
+
ctx_bufs.reserve(ctx_map.size());
|
5471
|
+
|
5472
|
+
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
|
5473
|
+
size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
|
5474
|
+
model.bufs.reserve(n_max_backend_buffer);
|
5031
5475
|
|
5032
5476
|
for (auto & it : ctx_map) {
|
5033
5477
|
ggml_backend_buffer_type_t buft = it.first;
|
5034
|
-
ggml_context * ctx
|
5035
|
-
|
5478
|
+
ggml_context * ctx = it.second;
|
5479
|
+
|
5480
|
+
llama_buf_map bufs;
|
5481
|
+
bufs.reserve(n_max_backend_buffer);
|
5036
5482
|
|
5037
5483
|
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
5038
5484
|
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
5039
5485
|
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
5040
|
-
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
|
5041
|
-
|
5042
|
-
|
5043
|
-
|
5044
|
-
|
5045
|
-
|
5046
|
-
|
5486
|
+
if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
|
5487
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5488
|
+
void * addr = nullptr;
|
5489
|
+
size_t first, last;
|
5490
|
+
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
5491
|
+
if (first >= last) {
|
5492
|
+
continue;
|
5493
|
+
}
|
5494
|
+
ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
|
5495
|
+
if (buf == nullptr) {
|
5496
|
+
throw std::runtime_error("unable to allocate backend CPU buffer");
|
5497
|
+
}
|
5498
|
+
model.bufs.push_back(buf);
|
5499
|
+
bufs.emplace(idx, buf);
|
5500
|
+
#ifdef GGML_USE_CUDA
|
5501
|
+
if (n_layer >= n_gpu_layers) {
|
5502
|
+
ggml_backend_cuda_register_host_buffer(
|
5047
5503
|
ggml_backend_buffer_get_base(buf),
|
5048
5504
|
ggml_backend_buffer_get_size(buf));
|
5049
|
-
|
5505
|
+
}
|
5050
5506
|
#endif
|
5507
|
+
}
|
5051
5508
|
}
|
5052
5509
|
#ifdef GGML_USE_METAL
|
5053
|
-
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
|
5054
|
-
|
5055
|
-
|
5056
|
-
|
5057
|
-
|
5510
|
+
else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
|
5511
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5512
|
+
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
5513
|
+
void * addr = nullptr;
|
5514
|
+
size_t first, last;
|
5515
|
+
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
5516
|
+
if (first >= last) {
|
5517
|
+
continue;
|
5518
|
+
}
|
5519
|
+
ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
|
5520
|
+
if (buf == nullptr) {
|
5521
|
+
throw std::runtime_error("unable to allocate backend metal buffer");
|
5522
|
+
}
|
5523
|
+
model.bufs.push_back(buf);
|
5524
|
+
bufs.emplace(idx, buf);
|
5525
|
+
}
|
5058
5526
|
}
|
5059
5527
|
#endif
|
5060
5528
|
else {
|
5061
|
-
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
5062
|
-
if (buf
|
5529
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
5530
|
+
if (buf == nullptr) {
|
5531
|
+
throw std::runtime_error("unable to allocate backend buffer");
|
5532
|
+
}
|
5533
|
+
model.bufs.push_back(buf);
|
5534
|
+
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
|
5063
5535
|
model.mlock_bufs.emplace_back(new llama_mlock);
|
5064
5536
|
auto & mlock_buf = model.mlock_bufs.back();
|
5065
5537
|
mlock_buf->init (ggml_backend_buffer_get_base(buf));
|
5066
5538
|
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
|
5067
5539
|
}
|
5540
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5541
|
+
bufs.emplace(idx, buf);
|
5542
|
+
}
|
5068
5543
|
}
|
5069
|
-
|
5544
|
+
|
5545
|
+
if (bufs.empty()) {
|
5070
5546
|
throw std::runtime_error("failed to allocate buffer");
|
5071
5547
|
}
|
5072
|
-
|
5073
|
-
|
5074
|
-
|
5075
|
-
|
5076
|
-
|
5548
|
+
|
5549
|
+
for (auto & buf : bufs) {
|
5550
|
+
// indicate that this buffer contains weights
|
5551
|
+
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
|
5552
|
+
ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
5553
|
+
}
|
5554
|
+
|
5555
|
+
ctx_bufs.emplace_back(ctx, bufs);
|
5077
5556
|
}
|
5078
5557
|
|
5079
5558
|
if (llama_supports_gpu_offload()) {
|
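The rewritten buffer setup above wraps each file's mapped tensor range in a backend buffer per (context, file) pair instead of allocating and copying, and then tags every buffer with `GGML_BACKEND_BUFFER_USAGE_WEIGHTS` so the scheduler prefers the backend that already holds a weight. A minimal sketch of the CPU path, with a plain `malloc` standing in for the real `mmap` region:

```cpp
// Sketch: wrap an existing host range as a CPU backend buffer and mark it as weights.
#include "ggml-backend.h"
#include <cstdio>
#include <cstdlib>

int main() {
    const size_t size = 1 << 20;
    void * region = std::malloc(size); // stand-in for (mapping base + first)

    ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(region, size);
    ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

    std::printf("wrapped %zu bytes at %p\n",
                ggml_backend_buffer_get_size(buf), ggml_backend_buffer_get_base(buf));

    ggml_backend_buffer_free(buf);
    std::free(region);
    return 0;
}
```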
@@ -5105,13 +5584,17 @@ static bool llm_load_tensors(
|
|
5105
5584
|
// load tensor data
|
5106
5585
|
for (auto & it : ctx_bufs) {
|
5107
5586
|
ggml_context * ctx = it.first;
|
5108
|
-
|
5109
|
-
if (!ml.load_all_data(ctx,
|
5587
|
+
auto & bufs = it.second;
|
5588
|
+
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
5110
5589
|
return false;
|
5111
5590
|
}
|
5112
5591
|
}
|
5113
5592
|
|
5114
|
-
|
5593
|
+
if (use_mmap_buffer) {
|
5594
|
+
for (auto & mapping : ml.mappings) {
|
5595
|
+
model.mappings.emplace_back(std::move(mapping));
|
5596
|
+
}
|
5597
|
+
}
|
5115
5598
|
|
5116
5599
|
// loading time will be recalculate after the first eval, so
|
5117
5600
|
// we take page faults deferred by mmap() into consideration
|
@@ -5266,8 +5749,8 @@ static void llm_build_kv_store(
|
|
5266
5749
|
GGML_ASSERT(kv.size == n_ctx);
|
5267
5750
|
|
5268
5751
|
// compute the transposed [n_tokens, n_embd] V matrix
|
5269
|
-
|
5270
|
-
|
5752
|
+
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
|
5753
|
+
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
|
5271
5754
|
cb(v_cur_t, "v_cur_t", il);
|
5272
5755
|
|
5273
5756
|
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
|
@@ -5451,6 +5934,20 @@ static struct ggml_tensor * llm_build_kqv(
|
|
5451
5934
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
5452
5935
|
}
|
5453
5936
|
|
5937
|
+
if (model.arch == LLM_ARCH_GROK) {
|
5938
|
+
// need to do the following:
|
5939
|
+
// multiply by attn_output_multiplier of 0.08838834764831845
|
5940
|
+
// and then :
|
5941
|
+
// kq = 30 * tanh(kq / 30)
|
5942
|
+
// before the softmax below
|
5943
|
+
|
5944
|
+
//try from phi2
|
5945
|
+
//ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
5946
|
+
|
5947
|
+
kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
|
5948
|
+
kq = ggml_scale(ctx, kq, 30);
|
5949
|
+
}
|
5950
|
+
|
5454
5951
|
#if defined(GGML_USE_KOMPUTE)
|
5455
5952
|
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
|
5456
5953
|
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
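Reading the two Grok lines above as one formula: the head-output multiplier and the 1/30 of the soft-cap are folded into a single `ggml_scale`, the result goes through `ggml_tanh`, and a second `ggml_scale` restores the factor of 30. This is my restatement of the code, not a formula quoted from upstream documentation:

```latex
\mathrm{kq} \;\leftarrow\; 30 \cdot \tanh\!\left(\frac{\alpha \cdot \mathrm{kq}}{30}\right),
\qquad \alpha = 0.08838834764831845
```

The cap keeps the attention logits inside (-30, 30) before the softmax, which is why the combined pre-tanh constant appears in the code as `0.08838834764831845f/30.0f`.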
@@ -5577,7 +6074,8 @@ struct llm_build_context {
|
|
5577
6074
|
const float norm_rms_eps;
|
5578
6075
|
|
5579
6076
|
const int32_t n_tokens;
|
5580
|
-
const int32_t n_kv; // size of KV cache to consider (n_kv <=
|
6077
|
+
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
|
6078
|
+
const int32_t n_outputs;
|
5581
6079
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
5582
6080
|
const int32_t n_orig_ctx;
|
5583
6081
|
|
@@ -5624,6 +6122,7 @@ struct llm_build_context {
|
|
5624
6122
|
norm_rms_eps (hparams.f_norm_rms_eps),
|
5625
6123
|
n_tokens (batch.n_tokens),
|
5626
6124
|
n_kv (worst_case ? kv_self.size : kv_self.n),
|
6125
|
+
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
|
5627
6126
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
5628
6127
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
5629
6128
|
pooling_type (cparams.pooling_type),
|
@@ -5645,6 +6144,7 @@ struct llm_build_context {
|
|
5645
6144
|
lctx.inp_tokens = nullptr;
|
5646
6145
|
lctx.inp_embd = nullptr;
|
5647
6146
|
lctx.inp_pos = nullptr;
|
6147
|
+
lctx.inp_out_ids = nullptr;
|
5648
6148
|
lctx.inp_KQ_mask = nullptr;
|
5649
6149
|
lctx.inp_KQ_pos = nullptr;
|
5650
6150
|
lctx.inp_K_shift = nullptr;
|
@@ -5768,6 +6268,13 @@ struct llm_build_context {
|
|
5768
6268
|
return lctx.inp_pos;
|
5769
6269
|
}
|
5770
6270
|
|
6271
|
+
struct ggml_tensor * build_inp_out_ids() {
|
6272
|
+
lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
|
6273
|
+
cb(lctx.inp_out_ids, "inp_out_ids", -1);
|
6274
|
+
ggml_set_input(lctx.inp_out_ids);
|
6275
|
+
return lctx.inp_out_ids;
|
6276
|
+
}
|
6277
|
+
|
5771
6278
|
struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
|
5772
6279
|
if (causal) {
|
5773
6280
|
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
|
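`build_inp_out_ids()` above introduces a small I32 input holding the indices of the tokens whose logits are actually wanted; every graph builder in this diff then narrows its tensors with `ggml_get_rows` on the last layer so unused rows are never computed further. A self-contained sketch of that narrowing with made-up sizes (4 tokens, 8-dim states, 2 requested outputs):

```cpp
// Sketch: select a subset of token rows with an I32 index tensor, as the new
// inp_out_ids / ggml_get_rows path does on the final layer.
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    ggml_context * ctx = ggml_init(params);

    ggml_tensor * hidden = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4); // 4 tokens, 8 dims
    ggml_tensor * ids    = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);    // e.g. tokens 1 and 3

    ggml_tensor * picked = ggml_get_rows(ctx, hidden, ids); // shape becomes {8, 2}
    std::printf("picked: %lld x %lld\n", (long long) picked->ne[0], (long long) picked->ne[1]);

    ggml_free(ctx);
    return 0;
}
```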
@@ -5824,6 +6331,9 @@ struct llm_build_context {
|
|
5824
6331
|
struct ggml_cgraph * build_llama() {
|
5825
6332
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5826
6333
|
|
6334
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
6335
|
+
int32_t n_tokens = this->n_tokens;
|
6336
|
+
|
5827
6337
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5828
6338
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
5829
6339
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
@@ -5891,6 +6401,14 @@ struct llm_build_context {
|
|
5891
6401
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5892
6402
|
}
|
5893
6403
|
|
6404
|
+
if (il == n_layer - 1) {
|
6405
|
+
// skip computing output for unused tokens
|
6406
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6407
|
+
n_tokens = n_outputs;
|
6408
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6409
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6410
|
+
}
|
6411
|
+
|
5894
6412
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
5895
6413
|
cb(ffn_inp, "ffn_inp", il);
|
5896
6414
|
|
@@ -5943,19 +6461,19 @@ struct llm_build_context {
|
|
5943
6461
|
for (int i = 0; i < n_expert_used; ++i) {
|
5944
6462
|
ggml_tensor * cur_expert;
|
5945
6463
|
|
5946
|
-
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].
|
6464
|
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
5947
6465
|
cb(cur_up, "ffn_moe_up", il);
|
5948
6466
|
|
5949
|
-
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].
|
6467
|
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
5950
6468
|
cb(cur_gate, "ffn_moe_gate", il);
|
5951
6469
|
|
5952
6470
|
cur_gate = ggml_silu(ctx0, cur_gate);
|
5953
6471
|
cb(cur_gate, "ffn_moe_silu", il);
|
5954
6472
|
|
5955
|
-
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6473
|
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
5956
6474
|
cb(cur_expert, "ffn_moe_gate_par", il);
|
5957
6475
|
|
5958
|
-
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].
|
6476
|
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
5959
6477
|
cb(cur_expert, "ffn_moe_down", il);
|
5960
6478
|
|
5961
6479
|
cur_expert = ggml_mul(ctx0, cur_expert,
|
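For reference, the renamed `ffn_*_exps` calls above (together with the routing prologue that is unchanged and therefore outside this hunk) implement the usual top-k mixture of experts; the following is my summary of the surrounding code rather than an upstream formula:

```latex
p = \operatorname{softmax}(W_{\mathrm{gate\_inp}}\, h), \qquad
E = \operatorname{top\text{-}k}(p,\ k = n_{\mathrm{expert\_used}}), \qquad
w_e = \frac{p_e}{\sum_{e' \in E} p_{e'}}, \qquad
y = \sum_{e \in E} w_e \,\mathrm{FFN}_e(h)
```

Each `ggml_mul_mat_id` call evaluates one selected expert's up, gate, or down projection, and the final `ggml_mul` applies the renormalized gate weight before the per-expert contributions are summed.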
@@ -6070,6 +6588,13 @@ struct llm_build_context {
|
|
6070
6588
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6071
6589
|
}
|
6072
6590
|
|
6591
|
+
if (il == n_layer - 1) {
|
6592
|
+
// skip computing output for unused tokens
|
6593
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6594
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6595
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6596
|
+
}
|
6597
|
+
|
6073
6598
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6074
6599
|
cb(ffn_inp, "ffn_inp", il);
|
6075
6600
|
|
@@ -6112,6 +6637,111 @@ struct llm_build_context {
|
|
6112
6637
|
return gf;
|
6113
6638
|
}
|
6114
6639
|
|
6640
|
+
struct ggml_cgraph * build_xverse() {
|
6641
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6642
|
+
|
6643
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6644
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6645
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6646
|
+
|
6647
|
+
struct ggml_tensor * cur;
|
6648
|
+
struct ggml_tensor * inpL;
|
6649
|
+
|
6650
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6651
|
+
|
6652
|
+
// inp_pos - contains the positions
|
6653
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6654
|
+
|
6655
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6656
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6657
|
+
|
6658
|
+
// positions of the tokens in the KV cache
|
6659
|
+
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
6660
|
+
|
6661
|
+
for (int il = 0; il < n_layer; ++il) {
|
6662
|
+
struct ggml_tensor * inpSA = inpL;
|
6663
|
+
|
6664
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6665
|
+
model.layers[il].attn_norm, NULL,
|
6666
|
+
LLM_NORM_RMS, cb, il);
|
6667
|
+
cb(cur, "attn_norm", il);
|
6668
|
+
|
6669
|
+
// self-attention
|
6670
|
+
{
|
6671
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6672
|
+
cb(Qcur, "Qcur", il);
|
6673
|
+
|
6674
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6675
|
+
cb(Kcur, "Kcur", il);
|
6676
|
+
|
6677
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6678
|
+
cb(Vcur, "Vcur", il);
|
6679
|
+
|
6680
|
+
Qcur = ggml_rope_custom(
|
6681
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6682
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6683
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6684
|
+
);
|
6685
|
+
cb(Qcur, "Qcur", il);
|
6686
|
+
|
6687
|
+
Kcur = ggml_rope_custom(
|
6688
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6689
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6690
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6691
|
+
);
|
6692
|
+
cb(Kcur, "Kcur", il);
|
6693
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6694
|
+
model.layers[il].wo, NULL,
|
6695
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6696
|
+
}
|
6697
|
+
|
6698
|
+
if (il == n_layer - 1) {
|
6699
|
+
// skip computing output for unused tokens
|
6700
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6701
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6702
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6703
|
+
}
|
6704
|
+
|
6705
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6706
|
+
cb(ffn_inp, "ffn_inp", il);
|
6707
|
+
|
6708
|
+
// feed-forward network
|
6709
|
+
{
|
6710
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6711
|
+
model.layers[il].ffn_norm, NULL,
|
6712
|
+
LLM_NORM_RMS, cb, il);
|
6713
|
+
cb(cur, "ffn_norm", il);
|
6714
|
+
|
6715
|
+
cur = llm_build_ffn(ctx0, cur,
|
6716
|
+
model.layers[il].ffn_up, NULL,
|
6717
|
+
model.layers[il].ffn_gate, NULL,
|
6718
|
+
model.layers[il].ffn_down, NULL,
|
6719
|
+
NULL,
|
6720
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
6721
|
+
cb(cur, "ffn_out", il);
|
6722
|
+
}
|
6723
|
+
|
6724
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6725
|
+
cb(cur, "l_out", il);
|
6726
|
+
|
6727
|
+
// input for next layer
|
6728
|
+
inpL = cur;
|
6729
|
+
}
|
6730
|
+
|
6731
|
+
cur = inpL;
|
6732
|
+
|
6733
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
|
6734
|
+
cb(cur, "result_norm", -1);
|
6735
|
+
|
6736
|
+
// lm_head
|
6737
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
6738
|
+
cb(cur, "result_output", -1);
|
6739
|
+
|
6740
|
+
ggml_build_forward_expand(gf, cur);
|
6741
|
+
|
6742
|
+
return gf;
|
6743
|
+
}
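`build_xverse()` above follows the standard decoder recipe: RMS norm, rotary Q/K, KV-cached attention, then a SiLU-gated feed-forward block. The `ggml_rope_custom` calls rotate consecutive channel pairs of Q and K by a position-dependent angle; written without the frequency-scaling / YaRN parameters (which only modify the angles), the rotation is:

```latex
\begin{pmatrix} x'_{2i} \\ x'_{2i+1} \end{pmatrix}
=
\begin{pmatrix} \cos(p\,\theta_i) & -\sin(p\,\theta_i) \\ \sin(p\,\theta_i) & \cos(p\,\theta_i) \end{pmatrix}
\begin{pmatrix} x_{2i} \\ x_{2i+1} \end{pmatrix},
\qquad \theta_i = \mathrm{freq\_base}^{-2i/n_{\mathrm{rot}}}
```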
|
6744
|
+
|
6115
6745
|
struct ggml_cgraph * build_falcon() {
|
6116
6746
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6117
6747
|
|
@@ -6185,6 +6815,14 @@ struct llm_build_context {
|
|
6185
6815
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6186
6816
|
}
|
6187
6817
|
|
6818
|
+
if (il == n_layer - 1) {
|
6819
|
+
// skip computing output for unused tokens
|
6820
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6821
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6822
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
6823
|
+
attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
|
6824
|
+
}
|
6825
|
+
|
6188
6826
|
struct ggml_tensor * ffn_inp = cur;
|
6189
6827
|
|
6190
6828
|
// feed forward
|
@@ -6225,144 +6863,359 @@ struct llm_build_context {
|
|
6225
6863
|
return gf;
|
6226
6864
|
}
|
6227
6865
|
|
6228
|
-
struct ggml_cgraph *
|
6866
|
+
struct ggml_cgraph * build_grok() {
|
6229
6867
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6230
6868
|
|
6869
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
6870
|
+
int32_t n_tokens = this->n_tokens;
|
6871
|
+
|
6231
6872
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6232
|
-
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
6233
6873
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6874
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6234
6875
|
|
6235
6876
|
struct ggml_tensor * cur;
|
6236
6877
|
struct ggml_tensor * inpL;
|
6237
6878
|
|
6238
6879
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6239
6880
|
|
6881
|
+
// multiply by embedding_multiplier_scale of 78.38367176906169
|
6882
|
+
inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
|
6883
|
+
|
6240
6884
|
// inp_pos - contains the positions
|
6241
6885
|
struct ggml_tensor * inp_pos = build_inp_pos();
|
6242
6886
|
|
6243
6887
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6244
6888
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6245
6889
|
|
6246
|
-
struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
6247
|
-
cb(pos, "pos_embd", -1);
|
6248
|
-
|
6249
|
-
inpL = ggml_add(ctx0, inpL, pos);
|
6250
|
-
cb(inpL, "inpL", -1);
|
6251
|
-
|
6252
6890
|
for (int il = 0; il < n_layer; ++il) {
|
6891
|
+
struct ggml_tensor * inpSA = inpL;
|
6892
|
+
|
6893
|
+
// norm
|
6253
6894
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
6254
|
-
model.layers[il].attn_norm,
|
6255
|
-
|
6256
|
-
LLM_NORM, cb, il);
|
6895
|
+
model.layers[il].attn_norm, NULL,
|
6896
|
+
LLM_NORM_RMS, cb, il);
|
6257
6897
|
cb(cur, "attn_norm", il);
|
6258
6898
|
|
6899
|
+
|
6259
6900
|
// self-attention
|
6260
6901
|
{
|
6261
|
-
|
6262
|
-
|
6902
|
+
// compute Q and K and RoPE them
|
6903
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6904
|
+
cb(Qcur, "Qcur", il);
|
6905
|
+
if (model.layers[il].bq) {
|
6906
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
6907
|
+
cb(Qcur, "Qcur", il);
|
6908
|
+
}
|
6263
6909
|
|
6264
|
-
|
6265
|
-
cb(
|
6910
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6911
|
+
cb(Kcur, "Kcur", il);
|
6912
|
+
if (model.layers[il].bk) {
|
6913
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
6914
|
+
cb(Kcur, "Kcur", il);
|
6915
|
+
}
|
6266
6916
|
|
6267
|
-
struct ggml_tensor *
|
6268
|
-
|
6269
|
-
|
6917
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6918
|
+
cb(Vcur, "Vcur", il);
|
6919
|
+
if (model.layers[il].bv) {
|
6920
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
6921
|
+
cb(Vcur, "Vcur", il);
|
6922
|
+
}
|
6270
6923
|
|
6924
|
+
Qcur = ggml_rope_custom(
|
6925
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6926
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6927
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6928
|
+
);
|
6271
6929
|
cb(Qcur, "Qcur", il);
|
6272
|
-
cb(Kcur, "Kcur", il);
|
6273
|
-
cb(Vcur, "Vcur", il);
|
6274
6930
|
|
6275
|
-
|
6931
|
+
Kcur = ggml_rope_custom(
|
6932
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6933
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6934
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6935
|
+
);
|
6936
|
+
cb(Kcur, "Kcur", il);
|
6276
6937
|
|
6277
6938
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6278
6939
|
model.layers[il].wo, model.layers[il].bo,
|
6279
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f
|
6940
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
6280
6941
|
}
|
6281
6942
|
|
6282
|
-
|
6283
|
-
|
6284
|
-
|
6285
|
-
|
6286
|
-
|
6287
|
-
|
6288
|
-
|
6289
|
-
model.layers[il].ffn_norm,
|
6290
|
-
model.layers[il].ffn_norm_b,
|
6291
|
-
LLM_NORM, cb, il);
|
6292
|
-
cb(cur, "ffn_norm", il);
|
6943
|
+
if (il == n_layer - 1) {
|
6944
|
+
// skip computing output for unused tokens
|
6945
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6946
|
+
n_tokens = n_outputs;
|
6947
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6948
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6949
|
+
}
|
6293
6950
|
|
6294
|
-
|
6295
|
-
|
6296
|
-
|
6297
|
-
|
6298
|
-
NULL,
|
6299
|
-
|
6300
|
-
cb(cur, "
|
6951
|
+
// Grok
|
6952
|
+
// if attn_out_norm is present then apply it before adding the input
|
6953
|
+
if (model.layers[il].attn_out_norm) {
|
6954
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6955
|
+
model.layers[il].attn_out_norm, NULL,
|
6956
|
+
LLM_NORM_RMS, cb, il);
|
6957
|
+
cb(cur, "attn_out_norm", il);
|
6301
6958
|
}
|
6302
6959
|
|
6303
|
-
|
6304
|
-
cb(
|
6305
|
-
}
|
6960
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6961
|
+
cb(ffn_inp, "ffn_inp", il);
|
6306
6962
|
|
6307
|
-
|
6308
|
-
|
6309
|
-
|
6310
|
-
|
6311
|
-
|
6963
|
+
// feed-forward network
|
6964
|
+
// MoE branch
|
6965
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6966
|
+
model.layers[il].ffn_norm, NULL,
|
6967
|
+
LLM_NORM_RMS, cb, il);
|
6968
|
+
cb(cur, "ffn_norm", il);
|
6312
6969
|
|
6313
|
-
|
6314
|
-
|
6970
|
+
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
6971
|
+
cb(logits, "ffn_moe_logits", il);
|
6315
6972
|
|
6316
|
-
|
6973
|
+
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
6974
|
+
cb(probs, "ffn_moe_probs", il);
|
6317
6975
|
|
6318
|
-
|
6319
|
-
|
6976
|
+
// select experts
|
6977
|
+
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
6978
|
+
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
6320
6979
|
|
6321
|
-
|
6322
|
-
|
6980
|
+
ggml_tensor * weights = ggml_get_rows(ctx0,
|
6981
|
+
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
6982
|
+
cb(weights, "ffn_moe_weights", il);
|
6323
6983
|
|
6324
|
-
|
6325
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6326
|
-
GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
|
6984
|
+
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
6327
6985
|
|
6328
|
-
|
6329
|
-
|
6986
|
+
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
6987
|
+
cb(weights_sum, "ffn_moe_weights_sum", il);
|
6330
6988
|
|
6331
|
-
|
6989
|
+
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
6990
|
+
cb(weights, "ffn_moe_weights_norm", il);
|
6332
6991
|
|
6333
|
-
|
6334
|
-
|
6992
|
+
// compute expert outputs
|
6993
|
+
ggml_tensor * moe_out = nullptr;
|
6335
6994
|
|
6336
|
-
|
6337
|
-
|
6995
|
+
for (int i = 0; i < n_expert_used; ++i) {
|
6996
|
+
ggml_tensor * cur_expert;
|
6338
6997
|
|
6339
|
-
|
6340
|
-
|
6998
|
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
6999
|
+
cb(cur_up, "ffn_moe_up", il);
|
6341
7000
|
|
6342
|
-
|
6343
|
-
|
6344
|
-
model.layers[il].attn_norm_b,
|
6345
|
-
LLM_NORM, cb, il);
|
6346
|
-
cb(cur, "attn_norm", il);
|
7001
|
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
7002
|
+
cb(cur_gate, "ffn_moe_gate", il);
|
6347
7003
|
|
6348
|
-
|
6349
|
-
|
6350
|
-
|
6351
|
-
cb(cur, "wqkv", il);
|
7004
|
+
//GeLU
|
7005
|
+
cur_gate = ggml_gelu(ctx0, cur_gate);
|
7006
|
+
cb(cur_gate, "ffn_moe_gelu", il);
|
6352
7007
|
|
6353
|
-
|
6354
|
-
cb(
|
7008
|
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
7009
|
+
cb(cur_expert, "ffn_moe_gate_par", il);
|
6355
7010
|
|
6356
|
-
//
|
6357
|
-
|
7011
|
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
7012
|
+
cb(cur_expert, "ffn_moe_down", il);
|
6358
7013
|
|
6359
|
-
|
6360
|
-
|
7014
|
+
cur_expert = ggml_mul(ctx0, cur_expert,
|
7015
|
+
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
7016
|
+
cb(cur_expert, "ffn_moe_weighted", il);
|
6361
7017
|
|
6362
|
-
|
6363
|
-
|
7018
|
+
if (i == 0) {
|
7019
|
+
moe_out = cur_expert;
|
7020
|
+
} else {
|
7021
|
+
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
7022
|
+
cb(moe_out, "ffn_moe_out", il);
|
7023
|
+
}
|
7024
|
+
}
|
6364
7025
|
|
6365
|
-
|
7026
|
+
cur = moe_out;
|
7027
|
+
|
7028
|
+
// Grok
|
7029
|
+
// if layer_out_norm is present then apply it before adding the input
|
7030
|
+
// Idea: maybe ffn_out_norm is a better name
|
7031
|
+
if (model.layers[il].layer_out_norm) {
|
7032
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7033
|
+
model.layers[il].layer_out_norm, NULL,
|
7034
|
+
LLM_NORM_RMS, cb, il);
|
7035
|
+
cb(cur, "layer_out_norm", il);
|
7036
|
+
}
|
7037
|
+
|
7038
|
+
|
7039
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
7040
|
+
cb(cur, "ffn_out", il);
|
7041
|
+
|
7042
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
7043
|
+
if (layer_dir != nullptr) {
|
7044
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
7045
|
+
}
|
7046
|
+
cb(cur, "l_out", il);
|
7047
|
+
|
7048
|
+
// input for next layer
|
7049
|
+
inpL = cur;
|
7050
|
+
}
|
7051
|
+
|
7052
|
+
cur = inpL;
|
7053
|
+
|
7054
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7055
|
+
model.output_norm, NULL,
|
7056
|
+
LLM_NORM_RMS, cb, -1);
|
7057
|
+
cb(cur, "result_norm", -1);
|
7058
|
+
|
7059
|
+
// lm_head
|
7060
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7061
|
+
|
7062
|
+
// Grok
|
7063
|
+
// multiply logits by output_multiplier_scale of 0.5773502691896257
|
7064
|
+
|
7065
|
+
cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
|
7066
|
+
|
7067
|
+
cb(cur, "result_output", -1);
|
7068
|
+
|
7069
|
+
ggml_build_forward_expand(gf, cur);
|
7070
|
+
|
7071
|
+
return gf;
|
7072
|
+
}
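The three magic numbers used in `build_grok()` line up with simple closed forms, which makes them easier to audit; the dimensional reading in parentheses is my inference from Grok-1's published sizes, not something stated in this diff:

```latex
78.38367176906169 \approx \sqrt{6144} \;\bigl(\approx \sqrt{n_{\mathrm{embd}}}\bigr), \qquad
0.08838834764831845 \approx \tfrac{1}{\sqrt{128}} \;\bigl(\approx \tfrac{1}{\sqrt{d_{\mathrm{head}}}}\bigr), \qquad
0.5773502691896257 \approx \tfrac{1}{\sqrt{3}}
```

The first scales the token embeddings up on entry, the second scales the attention logits before the tanh cap, and the third scales the final logits before sampling.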
|
7073
|
+
|
7074
|
+
struct ggml_cgraph * build_starcoder() {
|
7075
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
7076
|
+
|
7077
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
7078
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
7079
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
7080
|
+
|
7081
|
+
struct ggml_tensor * cur;
|
7082
|
+
struct ggml_tensor * inpL;
|
7083
|
+
|
7084
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
7085
|
+
|
7086
|
+
// inp_pos - contains the positions
|
7087
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7088
|
+
|
7089
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7090
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7091
|
+
|
7092
|
+
struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
7093
|
+
cb(pos, "pos_embd", -1);
|
7094
|
+
|
7095
|
+
inpL = ggml_add(ctx0, inpL, pos);
|
7096
|
+
cb(inpL, "inpL", -1);
|
7097
|
+
|
7098
|
+
for (int il = 0; il < n_layer; ++il) {
|
7099
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
7100
|
+
model.layers[il].attn_norm,
|
7101
|
+
model.layers[il].attn_norm_b,
|
7102
|
+
LLM_NORM, cb, il);
|
7103
|
+
cb(cur, "attn_norm", il);
|
7104
|
+
|
7105
|
+
// self-attention
|
7106
|
+
{
|
7107
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
7108
|
+
cb(cur, "wqkv", il);
|
7109
|
+
|
7110
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
7111
|
+
cb(cur, "bqkv", il);
|
7112
|
+
|
7113
|
+
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
7114
|
+
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
7115
|
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
7116
|
+
|
7117
|
+
cb(Qcur, "Qcur", il);
|
7118
|
+
cb(Kcur, "Kcur", il);
|
7119
|
+
cb(Vcur, "Vcur", il);
|
7120
|
+
|
7121
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7122
|
+
|
7123
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7124
|
+
model.layers[il].wo, model.layers[il].bo,
|
7125
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7126
|
+
}
|
7127
|
+
|
7128
|
+
if (il == n_layer - 1) {
|
7129
|
+
// skip computing output for unused tokens
|
7130
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7131
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7132
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7133
|
+
}
|
7134
|
+
|
7135
|
+
// add the input
|
7136
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
7137
|
+
cb(ffn_inp, "ffn_inp", il);
|
7138
|
+
|
7139
|
+
// FF
|
7140
|
+
{
|
7141
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
7142
|
+
model.layers[il].ffn_norm,
|
7143
|
+
model.layers[il].ffn_norm_b,
|
7144
|
+
LLM_NORM, cb, il);
|
7145
|
+
cb(cur, "ffn_norm", il);
|
7146
|
+
|
7147
|
+
cur = llm_build_ffn(ctx0, cur,
|
7148
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
7149
|
+
NULL, NULL,
|
7150
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
7151
|
+
NULL,
|
7152
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
7153
|
+
cb(cur, "ffn_out", il);
|
7154
|
+
}
|
7155
|
+
|
7156
|
+
inpL = ggml_add(ctx0, cur, ffn_inp);
|
7157
|
+
cb(inpL, "l_out", il);
|
7158
|
+
}
|
7159
|
+
|
7160
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
7161
|
+
model.output_norm,
|
7162
|
+
model.output_norm_b,
|
7163
|
+
LLM_NORM, cb, -1);
|
7164
|
+
cb(cur, "result_norm", -1);
|
7165
|
+
|
7166
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7167
|
+
cb(cur, "result_output", -1);
|
7168
|
+
|
7169
|
+
ggml_build_forward_expand(gf, cur);
|
7170
|
+
|
7171
|
+
return gf;
|
7172
|
+
}
|
7173
|
+
|
7174
|
+
struct ggml_cgraph * build_persimmon() {
|
7175
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
7176
|
+
|
7177
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
7178
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
7179
|
+
GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
|
7180
|
+
|
7181
|
+
struct ggml_tensor * cur;
|
7182
|
+
struct ggml_tensor * inpL;
|
7183
|
+
|
7184
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
7185
|
+
|
7186
|
+
// inp_pos - contains the positions
|
7187
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7188
|
+
|
7189
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7190
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7191
|
+
|
7192
|
+
for (int il = 0; il < n_layer; ++il) {
|
7193
|
+
struct ggml_tensor * residual = inpL;
|
7194
|
+
|
7195
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
7196
|
+
model.layers[il].attn_norm,
|
7197
|
+
model.layers[il].attn_norm_b,
|
7198
|
+
LLM_NORM, cb, il);
|
7199
|
+
cb(cur, "attn_norm", il);
|
7200
|
+
|
7201
|
+
// self attention
|
7202
|
+
{
|
7203
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
7204
|
+
cb(cur, "wqkv", il);
|
7205
|
+
|
7206
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
7207
|
+
cb(cur, "bqkv", il);
|
7208
|
+
|
7209
|
+
// split qkv
|
7210
|
+
GGML_ASSERT(n_head_kv == n_head);
|
7211
|
+
|
7212
|
+
struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
|
7213
|
+
cb(tmpqkv, "tmpqkv", il);
|
7214
|
+
|
7215
|
+
struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
|
7216
|
+
cb(tmpqkv_perm, "tmpqkv", il);
|
7217
|
+
|
7218
|
+
struct ggml_tensor * tmpq = ggml_view_3d(
|
6366
7219
|
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
6367
7220
|
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
6368
7221
|
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
@@ -6476,6 +7329,13 @@ struct llm_build_context {
|
|
6476
7329
|
Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6477
7330
|
}
|
6478
7331
|
|
7332
|
+
if (il == n_layer - 1) {
|
7333
|
+
// skip computing output for unused tokens
|
7334
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7335
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7336
|
+
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
7337
|
+
}
|
7338
|
+
|
6479
7339
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
|
6480
7340
|
cb(ffn_inp, "ffn_inp", il);
|
6481
7341
|
|
@@ -6565,6 +7425,13 @@ struct llm_build_context {
|
|
6565
7425
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6566
7426
|
}
|
6567
7427
|
|
7428
|
+
if (il == n_layer - 1) {
|
7429
|
+
// skip computing output for unused tokens
|
7430
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7431
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7432
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7433
|
+
}
|
7434
|
+
|
6568
7435
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6569
7436
|
cb(ffn_inp, "ffn_inp", il);
|
6570
7437
|
|
@@ -6722,6 +7589,13 @@ struct llm_build_context {
|
|
6722
7589
|
}
|
6723
7590
|
cb(cur, "kqv_out", il);
|
6724
7591
|
|
7592
|
+
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
7593
|
+
// skip computing output for unused tokens
|
7594
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7595
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7596
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7597
|
+
}
|
7598
|
+
|
6725
7599
|
// re-add the layer input
|
6726
7600
|
cur = ggml_add(ctx0, cur, inpL);
|
6727
7601
|
|
@@ -6844,6 +7718,13 @@ struct llm_build_context {
|
|
6844
7718
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6845
7719
|
}
|
6846
7720
|
|
7721
|
+
if (il == n_layer - 1) {
|
7722
|
+
// skip computing output for unused tokens
|
7723
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7724
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7725
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7726
|
+
}
|
7727
|
+
|
6847
7728
|
// Add the input
|
6848
7729
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
6849
7730
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -6891,6 +7772,7 @@ struct llm_build_context {
|
|
6891
7772
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6892
7773
|
|
6893
7774
|
struct ggml_tensor * cur;
|
7775
|
+
struct ggml_tensor * pos;
|
6894
7776
|
struct ggml_tensor * inpL;
|
6895
7777
|
|
6896
7778
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
@@ -6901,6 +7783,16 @@ struct llm_build_context {
|
|
6901
7783
|
// positions of the tokens in the KV cache
|
6902
7784
|
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
6903
7785
|
|
7786
|
+
if (model.pos_embd) {
|
7787
|
+
// inp_pos - contains the positions
|
7788
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7789
|
+
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
7790
|
+
cb(pos, "pos_embd", -1);
|
7791
|
+
|
7792
|
+
inpL = ggml_add(ctx0, inpL, pos);
|
7793
|
+
cb(inpL, "inpL", -1);
|
7794
|
+
}
|
7795
|
+
|
6904
7796
|
for (int il = 0; il < n_layer; ++il) {
|
6905
7797
|
struct ggml_tensor * attn_norm;
|
6906
7798
|
|
@@ -6935,11 +7827,39 @@ struct llm_build_context {
|
|
6935
7827
|
cb(Kcur, "Kcur", il);
|
6936
7828
|
cb(Vcur, "Vcur", il);
|
6937
7829
|
|
6938
|
-
|
7830
|
+
// Q/K Layernorm
|
7831
|
+
if (model.layers[il].attn_q_norm) {
|
7832
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
7833
|
+
model.layers[il].attn_q_norm,
|
7834
|
+
model.layers[il].attn_q_norm_b,
|
7835
|
+
LLM_NORM, cb, il);
|
7836
|
+
cb(Qcur, "Qcur", il);
|
6939
7837
|
|
6940
|
-
|
7838
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
7839
|
+
model.layers[il].attn_k_norm,
|
7840
|
+
model.layers[il].attn_k_norm_b,
|
7841
|
+
LLM_NORM, cb, il);
|
7842
|
+
cb(Kcur, "Kcur", il);
|
7843
|
+
|
7844
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7845
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
7846
|
+
|
7847
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6941
7848
|
model.layers[il].wo, model.layers[il].bo,
|
6942
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
7849
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7850
|
+
} else {
|
7851
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7852
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7853
|
+
model.layers[il].wo, model.layers[il].bo,
|
7854
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7855
|
+
}
|
7856
|
+
}
|
7857
|
+
|
7858
|
+
if (il == n_layer - 1) {
|
7859
|
+
// skip computing output for unused tokens
|
7860
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7861
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7862
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
6943
7863
|
}
|
6944
7864
|
|
6945
7865
|
// Add the input
|
@@ -7055,6 +7975,13 @@ struct llm_build_context {
|
|
7055
7975
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7056
7976
|
}
|
7057
7977
|
|
7978
|
+
if (il == n_layer - 1) {
|
7979
|
+
// skip computing output for unused tokens
|
7980
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7981
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7982
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7983
|
+
}
|
7984
|
+
|
7058
7985
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7059
7986
|
cb(ffn_inp, "ffn_inp", il);
|
7060
7987
|
|
@@ -7161,6 +8088,13 @@ struct llm_build_context {
|
|
7161
8088
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7162
8089
|
}
|
7163
8090
|
|
8091
|
+
if (il == n_layer - 1) {
|
8092
|
+
// skip computing output for unused tokens
|
8093
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8094
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8095
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8096
|
+
}
|
8097
|
+
|
7164
8098
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7165
8099
|
 cb(ffn_inp, "ffn_inp", il);
 
@@ -7273,6 +8207,13 @@ struct llm_build_context {
 Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }
 
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
 struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
 cb(ffn_inp, "ffn_inp", il);
 
@@ -7391,6 +8332,14 @@ struct llm_build_context {
 Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
 }
 
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
+}
+
 // FF
 {
 ffn_output = llm_build_ffn(ctx0, attn_norm_output,
@@ -7488,6 +8437,14 @@ struct llm_build_context {
 
 cur = attention_norm;
 
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
+inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+}
+
 // feed-forward network
 {
 cur = llm_build_ffn(ctx0, cur,
@@ -7580,6 +8537,13 @@ struct llm_build_context {
 Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }
 
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+}
+
 // add the input
 struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
 cb(ffn_inp, "ffn_inp", il);
@@ -7680,6 +8644,13 @@ struct llm_build_context {
 Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }
 
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+}
+
 // add the input
 struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
 cb(ffn_inp, "ffn_inp", il);
@@ -7789,6 +8760,13 @@ struct llm_build_context {
 Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }
 
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
 struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
 cb(ffn_inp, "ffn_inp", il);
 
@@ -7899,6 +8877,13 @@ struct llm_build_context {
 Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }
 
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
 struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
 cb(ffn_inp, "ffn_inp", il);
 
@@ -8022,6 +9007,13 @@ struct llm_build_context {
 Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }
 
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
 // scale_res - scale the hidden states for residual connection
 const float scale_res = scale_depth/sqrtf(float(n_layer));
 cur = ggml_scale(ctx0, cur, scale_res);
@@ -8136,6 +9128,13 @@ struct llm_build_context {
 Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
 }
 
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+}
+
 struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
 cb(sa_out, "sa_out", il);
 
@@ -8248,6 +9247,13 @@ struct llm_build_context {
 Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }
 
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
 struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
 cb(ffn_inp, "ffn_inp", il);
 
@@ -8395,6 +9401,15 @@ struct llm_build_context {
 
 struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
 
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+x = ggml_get_rows(ctx0, x, inp_out_ids);
+y = ggml_get_rows(ctx0, y, inp_out_ids);
+z = ggml_get_rows(ctx0, z, inp_out_ids);
+inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+}
+
 // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
 y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
 y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
@@ -8478,6 +9493,31 @@ struct llm_build_context {
 cb(Vcur, "Vcur", il);
 }
 
+if (model.layers[il].attn_q_norm) {
+Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+ggml_element_size(Qcur) * n_embd_head,
+ggml_element_size(Qcur) * n_embd_head * n_head,
+0);
+cb(Qcur, "Qcur", il);
+Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+ggml_element_size(Kcur) * n_embd_head,
+ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+0);
+cb(Kcur, "Kcur", il);
+
+Qcur = llm_build_norm(ctx0, Qcur, hparams,
+model.layers[il].attn_q_norm,
+NULL,
+LLM_NORM, cb, il);
+cb(Qcur, "Qcur", il);
+
+Kcur = llm_build_norm(ctx0, Kcur, hparams,
+model.layers[il].attn_k_norm,
+NULL,
+LLM_NORM, cb, il);
+cb(Kcur, "Kcur", il);
+}
+
 Qcur = ggml_rope_custom(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8497,6 +9537,14 @@ struct llm_build_context {
 Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }
 
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+}
+
 struct ggml_tensor * attn_out = cur;
 
 // feed-forward network
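The hunks above all add the same per-architecture pattern: on the last layer, build_inp_out_ids() plus ggml_get_rows() restrict the remaining graph to only the rows whose outputs the caller requested. A minimal sketch of how a caller drives this through the public API, assuming the usual llama_batch fields from llama.h; the helper name and the setup details are illustrative, not part of this diff.

#include "llama.h"

// Only the last prompt token asks for logits, so the added
// "skip computing output for unused tokens" blocks can drop
// every other row from the tail of the graph.
static int decode_prompt(llama_context * ctx, const llama_token * toks, int n) {
    llama_batch batch = llama_batch_init(n, /*embd*/ 0, /*n_seq_max*/ 1);
    batch.n_tokens = n;
    for (int i = 0; i < n; ++i) {
        batch.token[i]     = toks[i];
        batch.pos[i]       = i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = (i == n - 1); // request an output for the last row only
    }
    const int ret = llama_decode(ctx, batch);
    llama_batch_free(batch);
    return ret;
}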
@@ -8648,6 +9696,10 @@ static struct ggml_cgraph * llama_build_graph(
 {
 result = llm.build_falcon();
 } break;
+case LLM_ARCH_GROK:
+{
+result = llm.build_grok();
+} break;
 case LLM_ARCH_STARCODER:
 {
 result = llm.build_starcoder();
@@ -8725,6 +9777,10 @@ static struct ggml_cgraph * llama_build_graph(
 {
 result = llm.build_mamba();
 } break;
+case LLM_ARCH_XVERSE:
+{
+result = llm.build_xverse();
+} break;
 case LLM_ARCH_COMMAND_R:
 {
 result = llm.build_command_r();
@@ -8790,9 +9846,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
 ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
 }
 
+if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
+GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
+const int64_t n_tokens = batch.n_tokens;
+
+GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+
+if (lctx.n_outputs == n_tokens) {
+for (int i = 0; i < n_tokens; ++i) {
+data[i] = i;
+}
+} else if (batch.logits) {
+int32_t n_outputs = 0;
+for (int i = 0; i < n_tokens; ++i) {
+if (batch.logits[i]) {
+data[n_outputs++] = i;
+}
+}
+// the graph needs to have been passed the correct number of outputs
+GGML_ASSERT(lctx.n_outputs == n_outputs);
+} else if (lctx.n_outputs == 1) {
+// only keep last output
+data[0] = n_tokens - 1;
+} else {
+GGML_ASSERT(lctx.n_outputs == 0);
+}
+}
+
 GGML_ASSERT(
+// (!a || b) is a logical implication (a -> b)
+// !hparams.causal_attn -> !cparams.causal_attn
 (hparams.causal_attn || !cparams.causal_attn) &&
-"
+"causal attention with embedding models is not supported"
 );
 
 if (lctx.inp_KQ_mask) {
@@ -8971,7 +10057,75 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
 }
 }
 
-
+// Make sure enough space is available for outputs.
+// Returns max number of outputs for which space was reserved.
+static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+const auto & cparams = lctx.cparams;
+const auto & hparams = lctx.model.hparams;
+
+const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
+
+const auto n_batch = cparams.n_batch;
+const auto n_vocab = hparams.n_vocab;
+const auto n_embd = hparams.n_embd;
+
+// TODO: use a per-batch flag for logits presence instead
+const bool has_logits = cparams.causal_attn;
+const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+
+const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
+const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
+
+if (lctx.output_ids.empty()) {
+// init, never resized afterwards
+lctx.output_ids.resize(n_batch);
+}
+
+const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
+const size_t new_size = (logits_size + embd_size) * sizeof(float);
+
+// alloc only when more than the current capacity is required
+// TODO: also consider shrinking the buffer
+if (!lctx.buf_output || prev_size < new_size) {
+if (lctx.buf_output) {
+#ifndef NDEBUG
+// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
+LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+#endif
+ggml_backend_buffer_free(lctx.buf_output);
+lctx.buf_output = nullptr;
+lctx.logits = nullptr;
+lctx.embd = nullptr;
+}
+
+lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
+if (lctx.buf_output == nullptr) {
+LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
+return 0;
+}
+}
+
+float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
+
+lctx.logits = has_logits ? output_base : nullptr;
+lctx.embd = has_embd ? output_base + logits_size : nullptr;
+
+lctx.output_size = n_outputs_max;
+lctx.logits_size = logits_size;
+lctx.embd_size = embd_size;
+
+// set all ids as invalid (negative)
+std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
+
+ggml_backend_buffer_clear(lctx.buf_output, 0);
+
+lctx.n_outputs = 0;
+
+return n_outputs_max;
+}
+
+
+static void llama_graph_compute(
 llama_context & lctx,
 ggml_cgraph * gf,
 int n_threads) {
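The new llama_output_reserve() sizes one host buffer for both logits and embeddings. A small worked example of that arithmetic, using made-up but typical numbers (32000-token vocab, room for 512 outputs, embeddings disabled); only the formula comes from the code above.

#include <cstdio>

int main() {
    const size_t n_vocab       = 32000;                              // hypothetical hparams.n_vocab
    const size_t n_outputs_max = 512;                                // max(n_outputs, cparams.n_seq_max)
    const size_t logits_size   = n_vocab * n_outputs_max;            // has_logits == true
    const size_t embd_size     = 0;                                  // has_embd   == false
    const size_t new_size      = (logits_size + embd_size) * sizeof(float);
    std::printf("output buffer: %.2f MiB\n", new_size / (1024.0 * 1024.0)); // 62.50 MiB
    return 0;
}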
@@ -9046,16 +10200,8 @@ static int llama_decode_internal(
 const int64_t n_embd = hparams.n_embd;
 const int64_t n_vocab = hparams.n_vocab;
 
-
-
-
-#ifndef NDEBUG
-auto & logits_valid = lctx.logits_valid;
-logits_valid.clear();
-logits_valid.resize(n_tokens_all);
-
-memset(logits_out, 0, lctx.logits_size*sizeof(float));
-#endif
+uint32_t n_outputs = 0;
+uint32_t n_outputs_prev = 0;
 
 const auto n_ubatch = cparams.n_ubatch;
 
@@ -9064,6 +10210,38 @@ static int llama_decode_internal(
 std::vector<llama_seq_id *> seq_id_arr;
 std::vector<std::vector<llama_seq_id>> seq_id;
 
+// count outputs
+if (batch_all.logits) {
+for (uint32_t i = 0; i < n_tokens_all; ++i) {
+n_outputs += batch_all.logits[i] != 0;
+}
+} else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
+n_outputs = n_tokens_all;
+} else {
+// keep last output only
+n_outputs = 1;
+}
+
+// reserve output buffer
+if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
+LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
+return -2;
+};
+
+// set output mappings
+if (batch_all.logits) {
+int32_t i_logits = 0;
+for (uint32_t i = 0; i < n_tokens_all; ++i) {
+if (batch_all.logits[i]) {
+lctx.output_ids[i] = i_logits++;
+}
+}
+} else {
+for (uint32_t i = 0; i < n_outputs; ++i) {
+lctx.output_ids[i] = i;
+}
+}
+
 for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
 const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
 llama_batch u_batch = {
@@ -9079,6 +10257,27 @@ static int llama_decode_internal(
 /* .all_seq_id = */ batch_all.all_seq_id,
 };
 
+// count the outputs in this u_batch
+{
+int32_t n_outputs_new = 0;
+
+if (u_batch.logits) {
+for (uint32_t i = 0; i < n_tokens; i++) {
+n_outputs_new += u_batch.logits[i] != 0;
+}
+} else if (n_outputs == n_tokens_all) {
+n_outputs_new = n_tokens;
+} else {
+// keep last output only
+if (cur_token + n_tokens >= n_tokens_all) {
+n_outputs_new = 1;
+}
+}
+
+// needs to happen before the graph is built
+lctx.n_outputs = n_outputs_new;
+}
+
 int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
 GGML_ASSERT(n_threads > 0);
 
@@ -9142,23 +10341,37 @@ static int llama_decode_internal(
 struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
 struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
 
-if (
+if (lctx.n_outputs == 0) {
+// no output
+res = nullptr;
+embd = nullptr;
+} else if (!hparams.causal_attn) {
 res = nullptr; // do not extract logits for embedding models such as BERT
 
 // token or sequence embeddings
 embd = gf->nodes[gf->n_nodes - 1];
 
 GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
-} else {
-
-
-
-
-
-
-} else {
-GGML_ASSERT(false && "missing result_output tensor");
+} else if (cparams.embeddings) {
+// the embeddings could be in the second to last tensor, or any of the previous tensors
+int i_embd = gf->n_nodes - 2;
+for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
+i_embd = gf->n_nodes - i;
+if (i_embd < 0) { break; }
+embd = gf->nodes[i_embd];
 }
+GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
+
+// TODO: use a per-batch flag to know when to skip logits while keeping embeddings
+if (!cparams.causal_attn) {
+res = nullptr; // do not extract logits when not needed
+// skip computing logits
+// TODO: is this safe?
+gf->n_nodes = i_embd + 1;
+}
+} else {
+embd = nullptr; // do not extract embeddings when not needed
+GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
 }
 // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
@@ -9201,50 +10414,23 @@ static int llama_decode_internal(
 //}
 
 // extract logits
-// TODO: do not compute and extract logits if only embeddings are needed
-// update the graphs to skip "result_output" if logits are not needed
 if (res) {
 ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
 GGML_ASSERT(backend_res != nullptr);
-
-
-
-
-
-
-
-
-
-// extract logits for the range [i_first, i_last)
-// group the requests to minimize the number of calls to the backend
-ggml_backend_tensor_get_async(backend_res, res,
-logits_out + n_vocab*(cur_token + i_first),
-i_first*n_vocab*sizeof(float),
-(i_last - i_first)*n_vocab*sizeof(float));
-i_first = -1;
-}
-}
-#ifndef NDEBUG
-logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
-#endif
-}
-} else if (lctx.logits_all) {
-ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
-#ifndef NDEBUG
-std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
-#endif
-} else {
-if (cur_token + n_tokens >= n_tokens_all) {
-ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
-#ifndef NDEBUG
-logits_valid[0] = true;
-#endif
-}
+GGML_ASSERT(lctx.logits != nullptr);
+
+float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
+const int32_t n_outputs_new = lctx.n_outputs;
+
+if (n_outputs_new) {
+GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
+GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
+ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
 }
 }
 
 // extract embeddings
-if (
+if (embd) {
 ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
 GGML_ASSERT(backend_embd != nullptr);
 
@@ -9252,16 +10438,14 @@ static int llama_decode_internal(
 case LLAMA_POOLING_TYPE_NONE:
 {
 // extract token embeddings
-
-
-
-
-
-
-
-
-ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
-}
+GGML_ASSERT(lctx.embd != nullptr);
+float * embd_out = lctx.embd + n_outputs_prev*n_embd;
+const int32_t n_outputs_new = lctx.n_outputs;
+
+if (n_outputs_new) {
+GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
+GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
+ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
 }
 } break;
 case LLAMA_POOLING_TYPE_CLS:
@@ -9288,8 +10472,12 @@ static int llama_decode_internal(
 } break;
 }
 }
+n_outputs_prev += lctx.n_outputs;
 }
 
+// set to total number of outputs in the batch, for use in llama_get_logits_ith
+lctx.n_outputs = n_outputs;
+
 // wait for the computation to finish (automatically done when obtaining the model output)
 //llama_synchronize(&lctx);
 
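With logits now packed per requested output rather than per batch position, llama_get_logits_ith() translates a batch index into a row of lctx.logits via lctx.output_ids. A short, hedged usage sketch; the greedy loop and helper name are illustrative only.

#include "llama.h"

// Pick the highest-scoring token from the last requested output of a batch.
static llama_token greedy_last_token(llama_context * ctx, const llama_batch & batch) {
    const float * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
    const int     n_vocab = llama_n_vocab(llama_get_model(ctx));
    llama_token best = 0;
    for (llama_token t = 1; t < n_vocab; ++t) {
        if (logits[t] > logits[best]) {
            best = t;
        }
    }
    return best;
}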
@@ -9933,7 +11121,7 @@ struct llm_tokenizer_bpe {
 add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
 }
 
-// add the
+// add the finished tokens to the final list keeping correct order for next and prev
 for (auto & sym : symbols) {
 if (sym.n > 0) {
 sym.prev = final_prev_index;
@@ -10202,9 +11390,6 @@ struct llm_tokenizer_wpm {
 output.push_back(vocab.special_unk_id);
 }
 }
-
-// append eos token
-output.push_back(vocab.special_eos_id);
 }
 
 std::vector<std::string> preprocess(const std::string & text) {
@@ -10218,7 +11403,7 @@ struct llm_tokenizer_wpm {
 if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
 continue;
 }
-code =
+code = unicode_tolower(code);
 if (type == CODEPOINT_TYPE_WHITESPACE) {
 code = ' ';
 }
@@ -10238,7 +11423,7 @@ struct llm_tokenizer_wpm {
 std::vector<std::string> words;
 while (r < new_str.size()) {
 // if is whitespace
-if (isspace(new_str[r])) {
+if (isspace(new_str[r], std::locale::classic())) {
 if (r > l) words.push_back(new_str.substr(l, (r - l)));
 l = r + 1;
 r = l;
@@ -10252,18 +11437,12 @@ struct llm_tokenizer_wpm {
 return words;
 }
 
-uint32_t to_lower(uint32_t code) {
-static const std::locale locale("en_US.UTF-8");
-#if defined(_WIN32)
-if (code > 0xFFFF) {
-return code;
-}
-#endif
-return std::tolower(wchar_t(code), locale);
-}
-
 bool is_ascii_punct(uint32_t code) {
-
+if (code > 0xFF) {
+return false;
+}
+auto c = char(static_cast<unsigned char>(code));
+return ispunct(c, std::locale::classic());
 }
 
 bool is_chinese_char(uint32_t cpt) {
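The WPM preprocessor now classifies characters against the classic "C" locale instead of whatever locale the process happens to run under. A tiny standalone demonstration of that behaviour; the values are illustrative and not taken from the diff.

#include <cstdio>
#include <locale>

int main() {
    const std::locale & c = std::locale::classic();
    std::printf("isspace(' ')  -> %d\n", (int) std::isspace(' ',  c)); // 1
    std::printf("isspace('\\t') -> %d\n", (int) std::isspace('\t', c)); // 1
    std::printf("ispunct(',')  -> %d\n", (int) std::ispunct(',',  c)); // 1
    std::printf("ispunct('a')  -> %d\n", (int) std::ispunct('a',  c)); // 0
    return 0;
}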
@@ -10415,30 +11594,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 }
 }
 
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
 std::vector<llama_vocab::id> output;
-
-// OG tokenizer behavior:
-//
-// tokenizer.encode('', add_bos=True) returns [1]
-// tokenizer.encode('', add_bos=False) returns []
-
-if (bos && vocab.special_bos_id != -1) {
-output.push_back(vocab.special_bos_id);
-}
-
-if (raw_text.empty()) {
-return output;
-}
-
 std::forward_list<fragment_buffer_variant> fragment_buffer;
-fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
 
-if (
+if (!raw_text.empty()) {
+fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
+if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
+}
 
 switch (vocab.type) {
 case LLAMA_VOCAB_TYPE_SPM:
 {
+// OG tokenizer behavior:
+//
+// tokenizer.encode('', add_special_tokens=True) returns [1]
+// tokenizer.encode('', add_special_tokens=False) returns []
+
+if (add_special && vocab.special_add_bos != 0) {
+GGML_ASSERT(vocab.special_bos_id != -1);
+output.push_back(vocab.special_bos_id);
+}
+
 for (const auto & fragment : fragment_buffer) {
 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
 // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -10464,9 +11641,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 output.push_back(fragment.token);
 }
 }
+
+if (add_special && vocab.special_add_eos == 1) {
+GGML_ASSERT(vocab.special_eos_id != -1);
+output.push_back(vocab.special_eos_id);
+}
 } break;
 case LLAMA_VOCAB_TYPE_BPE:
 {
+if (add_special && vocab.special_add_bos == 1) {
+GGML_ASSERT(vocab.special_bos_id != -1);
+output.push_back(vocab.special_bos_id);
+}
+
 for (const auto & fragment : fragment_buffer) {
 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
 auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -10480,9 +11667,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 output.push_back(fragment.token);
 }
 }
+
+GGML_ASSERT(vocab.special_add_eos != 1);
 } break;
 case LLAMA_VOCAB_TYPE_WPM:
 {
+if (add_special) {
+GGML_ASSERT(vocab.special_cls_id != -1);
+output.push_back(vocab.special_cls_id);
+}
+
 for (const auto & fragment : fragment_buffer) {
 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
 auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -10496,6 +11690,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 output.push_back(fragment.token);
 }
 }
+
+if (add_special) {
+GGML_ASSERT(vocab.special_sep_id != -1);
+output.push_back(vocab.special_sep_id);
+}
 } break;
 case LLAMA_VOCAB_TYPE_NONE:
 GGML_ASSERT(false);
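llama_tokenize_internal now takes add_special/parse_special and decides per vocab type which BOS/EOS/CLS/SEP tokens to attach. A hedged sketch of calling the public wrapper with the two flags; the wrapper in llama.h takes them as its final two booleans, though the exact parameter names in this release may differ from the internal ones shown above, and the helper itself is illustrative.

#include "llama.h"
#include <string>
#include <vector>

// Tokenize with special-token insertion enabled but special-token parsing disabled.
static std::vector<llama_token> tokenize(const llama_model * model, const std::string & text) {
    std::vector<llama_token> tokens(text.size() + 8); // rough upper bound plus room for BOS/EOS
    const int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                                 tokens.data(), (int) tokens.size(),
                                 /*add_special*/ true, /*parse_special*/ false);
    tokens.resize(n < 0 ? 0 : n);
    return tokens;
}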
@@ -10508,28 +11707,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 // grammar - internal
 //
 
-struct llama_partial_utf8 {
-uint32_t value; // bit value so far (unshifted)
-int n_remain; // num bytes remaining; -1 indicates invalid sequence
-};
-
-struct llama_grammar {
-const std::vector<std::vector<llama_grammar_element>> rules;
-std::vector<std::vector<const llama_grammar_element *>> stacks;
-
-// buffer for partially generated UTF-8 sequence from accepted tokens
-llama_partial_utf8 partial_utf8;
-};
-
-struct llama_grammar_candidate {
-size_t index;
-const uint32_t * code_points;
-llama_partial_utf8 partial_utf8;
-};
 
 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
 const std::string & src,
 llama_partial_utf8 partial_start) {
 static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -10680,7 +11861,9 @@ static void llama_grammar_advance_stack(
 std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
 if (stack.empty()) {
-new_stacks.
+if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+new_stacks.emplace_back(stack);
+}
 return;
 }
 
@@ -10717,7 +11900,10 @@ static void llama_grammar_advance_stack(
 }
 case LLAMA_GRETYPE_CHAR:
 case LLAMA_GRETYPE_CHAR_NOT:
-new_stacks.
+if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+// only add the stack if it's not a duplicate of one we already have
+new_stacks.emplace_back(stack);
+}
 break;
 default:
 // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -10731,12 +11917,13 @@ static void llama_grammar_advance_stack(
 // be positioned at a character range (see `llama_grammar_advance_stack`), and
 // produces the N possible stacks if the given char is accepted at those
 // positions
-
+void llama_grammar_accept(
 const std::vector<std::vector<llama_grammar_element>> & rules,
 const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-const uint32_t chr
+const uint32_t chr,
+std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
-
+new_stacks.clear();
 
 for (const auto & stack : stacks) {
 if (stack.empty()) {
@@ -10755,8 +11942,6 @@ static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_acc
 llama_grammar_advance_stack(rules, new_stack, new_stacks);
 }
 }
-
-return new_stacks;
 }
 
 static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
@@ -10770,6 +11955,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
 const std::vector<llama_grammar_candidate> & candidates) {
 
 std::vector<llama_grammar_candidate> rejects;
+rejects.reserve(candidates.size());
 
 if (stack.empty()) {
 for (const auto & tok : candidates) {
@@ -10783,6 +11969,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
 const llama_grammar_element * stack_pos = stack.back();
 
 std::vector<llama_grammar_candidate> next_candidates;
+next_candidates.reserve(candidates.size());
+
 for (const auto & tok : candidates) {
 if (*tok.code_points == 0) {
 // reached end of full codepoints in token, reject iff it ended in a partial sequence
@@ -11590,8 +12778,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 // Note terminating 0 in decoded string
 const auto decoded = decode_utf8(piece, grammar->partial_utf8);
 const auto & code_points = decoded.first;
+std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
 for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-
+llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
+grammar->stacks = tmp_new_stacks;
 }
 grammar->partial_utf8 = decoded.second;
 GGML_ASSERT(!grammar->stacks.empty());
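llama_grammar_accept now writes into a caller-provided vector instead of returning a new one, which lets llama_grammar_accept_token reuse a single scratch vector across code points. The same pattern restated as a small helper; it assumes access to the grammar's internal rules/stacks members, exactly as the call site above does, and the helper name is illustrative.

// Advance the grammar by a sequence of already-decoded codepoints,
// reusing one scratch vector for the updated stacks.
static void grammar_accept_codepoints(struct llama_grammar * grammar,
                                      const std::vector<uint32_t> & code_points) {
    std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
    for (const uint32_t cpt : code_points) {
        llama_grammar_accept(grammar->rules, grammar->stacks, cpt, tmp_new_stacks);
        grammar->stacks = tmp_new_stacks;
    }
}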
@@ -11957,7 +13147,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
 // for getting the current layer as I initially thought, and we need to resort to parsing the
 // tensor name.
-n_layer /= n_expert;
 if (sscanf(name, "blk.%d.", &i_layer) != 1) {
 throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
 }
@@ -11971,30 +13160,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
 // with the quantization of the output tensor
 if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
-
-
-
-
-
-
-
-
-
-
+if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
+new_type = qs.params->output_tensor_type;
+} else {
+int nx = tensor->ne[0];
+if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+new_type = GGML_TYPE_Q8_0;
+}
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
+ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+new_type = GGML_TYPE_Q5_K;
+}
+else if (new_type != GGML_TYPE_Q8_0) {
+new_type = GGML_TYPE_Q6_K;
+}
 }
 } else if (name == "token_embd.weight") {
-if (
-
-
-
-
-
-
-
-
+if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
+new_type = qs.params->token_embedding_type;
+} else {
+if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
+ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+new_type = GGML_TYPE_Q2_K;
+}
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+new_type = GGML_TYPE_IQ3_S;
+}
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+new_type = GGML_TYPE_IQ3_S;
+}
 }
 } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
 if (name.find("attn_v.weight") != std::string::npos) {
 if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
 else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
@@ -12013,7 +13211,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 if (qs.model.hparams.n_expert == 8) {
 new_type = GGML_TYPE_Q5_K;
 } else {
-if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
 }
 }
@@ -12027,13 +13225,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
 new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
 }
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
-new_type = GGML_TYPE_Q4_K;
-}
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-new_type = GGML_TYPE_Q4_K;
-}
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
+else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
 new_type = GGML_TYPE_Q4_K;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
@@ -12186,7 +13378,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
 new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
 new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-new_type == GGML_TYPE_IQ3_XXS ||
+new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
+new_type == GGML_TYPE_IQ1_M) {
 int nx = tensor->ne[0];
 int ny = tensor->ne[1];
 if (nx % QK_K != 0) {
@@ -12204,6 +13397,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 case GGML_TYPE_IQ3_XXS:
 case GGML_TYPE_IQ3_S:
 case GGML_TYPE_IQ1_S:
+case GGML_TYPE_IQ1_M:
 case GGML_TYPE_Q2_K:
 case GGML_TYPE_Q3_K:
 case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
@@ -12219,9 +13413,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 return new_type;
 }
 
-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
 std::mutex mutex;
-
+int64_t counter = 0;
 size_t new_size = 0;
 if (nthread < 2) {
 // single-thread
@@ -12229,11 +13423,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
 }
 auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
 nrows, n_per_row, imatrix]() {
-const
+const int64_t nrows_per_chunk = chunk_size / n_per_row;
 size_t local_size = 0;
 while (true) {
 std::unique_lock<std::mutex> lock(mutex);
-
+int64_t first_row = counter; counter += nrows_per_chunk;
 if (first_row >= nrows) {
 if (local_size > 0) {
 new_size += local_size;
@@ -12241,7 +13435,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
 break;
 }
 lock.unlock();
-const
+const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
 local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
 }
 };
@@ -12285,6 +13479,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
 case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
 case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
+case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
 case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
 case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
 case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
@@ -12307,8 +13502,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 constexpr bool use_mmap = false;
 #endif
 
-
-
+llama_model_kv_override * kv_overrides = nullptr;
+if (params->kv_overrides) {
+auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+kv_overrides = v->data();
+}
+llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
+ml.init_mappings(false); // no prefetching
 
 llama_model model;
 llm_load_arch(ml, model);
@@ -12332,36 +13532,48 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 struct gguf_context * ctx_out = gguf_init_empty();
 
 // copy the KV pairs from the input file
-gguf_set_kv (ctx_out, ml.
+gguf_set_kv (ctx_out, ml.meta);
 gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
 gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
+if (params->kv_overrides) {
+const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
+for (auto & o : overrides) {
+if (o.key[0] == 0) break;
+if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
+gguf_set_val_f32(ctx_out, o.key, o.float_value);
+} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
+gguf_set_val_i32(ctx_out, o.key, o.int_value);
+} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
+gguf_set_val_bool(ctx_out, o.key, o.bool_value);
+} else {
+LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
+}
+}
+}
+
 for (int i = 0; i < ml.n_tensors; ++i) {
-struct ggml_tensor * meta = ml.get_tensor_meta(i);
+const struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
 const std::string name = ggml_get_name(meta);
 
 // TODO: avoid hardcoded tensor names - use the TN_* constants
-if (name.find("attn_v.weight")
+if (name.find("attn_v.weight") != std::string::npos ||
+name.find("attn_qkv.weight") != std::string::npos) {
 ++qs.n_attention_wv;
-}
-else if (name.find("ffn_down") != std::string::npos) {
-++qs.n_ffn_down;
-}
-else if (name.find("ffn_gate") != std::string::npos) {
-++qs.n_ffn_gate;
-}
-else if (name.find("ffn_up") != std::string::npos) {
-++qs.n_ffn_up;
-}
-else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
 qs.has_output = true;
 }
 }
-
-
-
-
+
+qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
+
+// sanity checks
+//
+// - qs.n_attention_wv == 0 for Mamba models
+// - qs.n_attention_wv == model.hparams.n_layer for Transformer models
+//
+GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
 
 size_t total_size_org = 0;
 size_t total_size_new = 0;
@@ -12377,7 +13589,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
 // populate the original tensors so we get an initial meta data
 for (int i = 0; i < ml.n_tensors; ++i) {
-struct ggml_tensor * meta = ml.get_tensor_meta(i);
+const struct ggml_tensor * meta = ml.get_tensor_meta(i);
 gguf_add_tensor(ctx_out, meta);
 }
 
@@ -12391,6 +13603,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // placeholder for the meta data
 ::zeros(fout, meta_size);
 
+const auto tn = LLM_TN(model.arch);
+
 for (int i = 0; i < ml.n_tensors; ++i) {
 struct ggml_tensor * tensor = ml.get_tensor_meta(i);
 
@@ -12413,8 +13627,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // This used to be a regex, but <regex> has an extreme cost to compile times.
 bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
-// quantize only 2D tensors
-quantize &= (ggml_n_dims(tensor)
+// quantize only 2D and 3D tensors (experts)
+quantize &= (ggml_n_dims(tensor) >= 2);
+
+// do not quantize norm tensors
+quantize &= name.find("_norm.weight") == std::string::npos;
+
 quantize &= params->quantize_output_tensor || name != "output.weight";
 quantize &= !params->only_copy;
 
@@ -12443,6 +13661,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 if (!params->pure && ggml_is_quantized(default_type)) {
 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
 }
+if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
+new_type = params->token_embedding_type;
+}
+if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
+new_type = params->output_tensor_type;
+}
 
 // If we've decided to quantize to the same type the tensor is already
 // in then there's nothing to do.
@@ -12455,7 +13679,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 new_size = ggml_nbytes(tensor);
 LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
 } else {
-const
+const int64_t nelements = ggml_nelements(tensor);
 
 const float * imatrix = nullptr;
 if (imatrix_data) {
@@ -12463,11 +13687,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 if (it == imatrix_data->end()) {
 LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
 } else {
-if (it->second.size() == (size_t)tensor->ne[0]) {
+if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
 imatrix = it->second.data();
 } else {
 LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
-int(it->second.size()), int(tensor->ne[0]), tensor->name);
+int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
+
+// this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
+// this is a significant error and it may be good idea to abort the process if this happens,
+// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
+// tok_embd should be ignored in this case, since it always causes this warning
+if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
+int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
+}
 }
 }
 }
@@ -12475,6 +13708,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 new_type == GGML_TYPE_IQ2_XS ||
 new_type == GGML_TYPE_IQ2_S ||
 new_type == GGML_TYPE_IQ1_S ||
+(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
 (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
 LLAMA_LOG_ERROR("\n\n============================================================\n");
 LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -12497,21 +13731,30 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
 fflush(stdout);
 
-if (work.size() < nelements * 4) {
+if (work.size() < (size_t)nelements * 4) {
 work.resize(nelements * 4); // upper bound on size
 }
 new_data = work.data();
 
-const
-const
+const int64_t n_per_row = tensor->ne[0];
+const int64_t nrows = tensor->ne[1];
+
+static const int64_t min_chunk_size = 32 * 512;
+const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
 
-
-const
+const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
 
-
-
-
+// quantize each expert separately since they have different importance matrices
+new_size = 0;
+for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
+const float * f32_data_03 = f32_data + i03 * nelements_matrix;
+void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
+const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
 
+new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+}
 LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
 }
 total_size_org += ggml_nbytes(tensor);
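The expert-aware quantization loop above splits a 3D tensor into ne[2] independent 2D slices; each slice gets its own source, destination and imatrix offset. A short sketch of just that offset arithmetic with illustrative sizes; the tensor shape and target type are made up, only the formulas mirror the code above.

#include "ggml.h"

int main() {
    const enum ggml_type new_type = GGML_TYPE_Q4_K;  // hypothetical target type
    const int64_t n_per_row = 4096, nrows = 14336, n_expert = 8;
    for (int64_t i03 = 0; i03 < n_expert; ++i03) {
        const int64_t src_off = i03 * n_per_row * nrows;                           // floats into f32_data
        const size_t  dst_off = ggml_row_size(new_type, n_per_row) * i03 * nrows;  // bytes into new_data
        const int64_t imx_off = i03 * n_per_row;                                   // floats into the imatrix
        (void) src_off; (void) dst_off; (void) imx_off; // each slice is quantized independently
    }
    return 0;
}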
@@ -12582,7 +13825,7 @@ static int llama_apply_lora_from_file_internal(
 if (path_base_model) {
 LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
 ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
-ml->
+ml->init_mappings(/*prefetch*/ false); // no prefetching
 }
 
 struct tensor_meta {
@@ -12703,7 +13946,7 @@ static int llama_apply_lora_from_file_internal(
 
 ggml_tensor * base_t;
 if (ml) {
-if (
+if (!ml->get_tensor_meta(base_name.c_str())) {
 LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
 return 1;
 }
@@ -12887,11 +14130,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 struct llama_model_quantize_params result = {
 /*.nthread =*/ 0,
 /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+/*.output_tensor_type =*/ GGML_TYPE_COUNT,
+/*.token_embedding_type =*/ GGML_TYPE_COUNT,
 /*.allow_requantize =*/ false,
 /*.quantize_output_tensor =*/ true,
 /*.only_copy =*/ false,
 /*.pure =*/ false,
 /*.imatrix =*/ nullptr,
+/*.kv_overrides =*/ nullptr,
 };
 
 return result;
@@ -12900,7 +14146,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 size_t llama_max_devices(void) {
 #if defined(GGML_USE_METAL)
 return 1;
-#elif defined(
+#elif defined(GGML_USE_CUDA)
 return GGML_CUDA_MAX_DEVICES;
 #elif defined(GGML_USE_SYCL)
 return GGML_SYCL_MAX_DEVICES;
@@ -12920,8 +14166,8 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(
-defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 return true;
 #else
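The two new fields in llama_model_quantize_params let callers pin the output and token-embedding tensors to specific types without touching the rest of the ftype logic. A hedged usage sketch; the file names, chosen types and helper name are illustrative.

#include "llama.h"

// Quantize to Q4_K_M, but force output.weight to Q6_K and token_embd.weight to Q4_K.
static int quantize_with_overrides(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.nthread              = 8;
    params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    params.output_tensor_type   = GGML_TYPE_Q6_K;
    params.token_embedding_type = GGML_TYPE_Q4_K;
    params.kv_overrides         = nullptr; // optionally a std::vector<llama_model_kv_override>*
    return (int) llama_model_quantize(fname_inp, fname_out, &params);
}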
@@ -13028,7 +14274,7 @@ struct llama_context * llama_new_context_with_model(
|
|
13028
14274
|
const auto & hparams = model->hparams;
|
13029
14275
|
auto & cparams = ctx->cparams;
|
13030
14276
|
|
13031
|
-
|
14277
|
+
cparams.n_seq_max = std::max(1u, params.n_seq_max);
|
13032
14278
|
cparams.n_threads = params.n_threads;
|
13033
14279
|
cparams.n_threads_batch = params.n_threads_batch;
|
13034
14280
|
cparams.yarn_ext_factor = params.yarn_ext_factor;
|
@@ -13126,7 +14372,7 @@ struct llama_context * llama_new_context_with_model(
|
|
13126
14372
|
}
|
13127
14373
|
ctx->backends.push_back(ctx->backend_metal);
|
13128
14374
|
}
|
13129
|
-
#elif defined(
|
14375
|
+
#elif defined(GGML_USE_CUDA)
|
13130
14376
|
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
13131
14377
|
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
13132
14378
|
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
@@ -13149,7 +14395,20 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 #elif defined(GGML_USE_VULKAN)
-    if (model->
+    if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+        LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
+        llama_free(ctx);
+        return nullptr;
+    }
+    if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
+        ggml_backend_t backend = ggml_backend_vk_init(0);
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    } else {
         for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
             ggml_backend_t backend = ggml_backend_vk_init(device);
             if (backend == nullptr) {
@@ -13161,30 +14420,28 @@ struct llama_context * llama_new_context_with_model(
             }
         }
 #elif defined(GGML_USE_SYCL)
-
-
-
-
+        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+            if (backend == nullptr) {
+                int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_LAYER requires a backend for each GPU
+            for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                ggml_backend_t backend = ggml_backend_sycl_init(i);
                 if (backend == nullptr) {
-                    int
-
+                    int id_list[GGML_SYCL_MAX_DEVICES];
+                    ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
-        } else {
-            // LLAMA_SPLIT_LAYER requires a backend for each GPU
-            for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
-                ggml_backend_t backend = ggml_backend_sycl_init(i);
-                if (backend == nullptr) {
-                    int id_list[GGML_SYCL_MAX_DEVICES];
-                    ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
-                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
         }
     }
 #elif defined(GGML_USE_KOMPUTE)
@@ -13232,25 +14489,12 @@ struct llama_context * llama_new_context_with_model(

     // graph outputs buffer
     {
-        // resized during inference
-        ctx
-
-
-        const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
-
-        ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
-        if (ctx->buf_output == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
+        // resized during inference when a batch uses more outputs
+        if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
+            LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
             llama_free(ctx);
             return nullptr;
         }
-        ggml_backend_buffer_clear(ctx->buf_output, 0);
-
-
-        ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
-        if (params.embeddings) {
-            ctx->embd = ctx->logits + ctx->logits_size;
-        }

         LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
             ggml_backend_buffer_name(ctx->buf_output),
@@ -13275,7 +14519,7 @@ struct llama_context * llama_new_context_with_model(

     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
     bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
-#ifndef
+#ifndef GGML_USE_CUDA
     // pipeline parallelism requires support for async compute and events
     // currently this is only implemented in the CUDA backend
     pipeline_parallel = false;
@@ -13383,11 +14627,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_XVERSE:
         case LLM_ARCH_COMMAND_R:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
+        case LLM_ARCH_GROK:
         case LLM_ARCH_PERSIMMON:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
@@ -13763,30 +15009,60 @@ void llama_kv_cache_update(struct llama_context * ctx) {
     llama_kv_cache_update_internal(*ctx);
 }

+// deprecated
+size_t llama_get_state_size(const struct llama_context * ctx) {
+    return llama_state_get_size(ctx);
+}
+
+// deprecated
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+    return llama_state_get_data(ctx, dst);
+}
+
+// deprecated
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    return llama_state_set_data(ctx, src);
+}
+
+// deprecated
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+}
+
+// deprecated
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    return llama_state_save_file(ctx, path_session, tokens, n_token_count);
+}

 // Returns the *maximum* size of the state
-size_t
+size_t llama_state_get_size(const struct llama_context * ctx) {
+    const auto & cparams = ctx->cparams;
+    const auto & hparams = ctx->model.hparams;
+
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
     const size_t s_rng_size = sizeof(size_t);
     const size_t s_rng = LLAMA_MAX_RNG_STATE;
+    const size_t s_n_outputs = sizeof(size_t);
+    // assume worst case for outputs although only currently set ones are serialized
+    const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t);
     const size_t s_logits_size = sizeof(size_t);
-
-    const size_t s_logits = ctx->logits_size * sizeof(float);
+    const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
     const size_t s_embedding_size = sizeof(size_t);
-    const size_t s_embedding = ctx->embd_size * sizeof(float);
+    const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0;
     const size_t s_kv_buf_size = sizeof(size_t);
     const size_t s_kv_head = sizeof(uint32_t);
     const size_t s_kv_size = sizeof(uint32_t);
     const size_t s_kv_used = sizeof(uint32_t);
     const size_t s_kv = ctx->kv_self.total_size();
-
-    const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
+    const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
     const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;

     const size_t s_total = (
         + s_rng_size
         + s_rng
+        + s_n_outputs
+        + s_output_pos
         + s_logits_size
         + s_logits
         + s_embedding_size
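The old llama_get_state_size / llama_copy_state_data / llama_set_state_data names remain as deprecated wrappers around the new llama_state_* functions. A minimal sketch of an in-memory snapshot and rollback using the new names; `ctx` is assumed to be an already-initialized llama_context:

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    std::vector<uint8_t> snapshot_state(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_state_get_size(ctx)); // upper bound on the serialized size
        const size_t written = llama_state_get_data(ctx, buf.data());
        buf.resize(written);                                 // shrink to what was actually written
        return buf;
    }

    void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
        llama_state_set_data(ctx, buf.data());
    }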
@@ -13847,21 +15123,21 @@ struct llama_data_file_context : llama_data_context {
 * file context:
 * llama_file file("/path", "wb");
 * llama_data_file_context data_ctx(&file);
-*
+* llama_state_get_data(ctx, &data_ctx);
 *
 * buffer context:
 * std::vector<uint8_t> buf(max_size, 0);
 * llama_data_buffer_context data_ctx(&buf.data());
-*
+* llama_state_get_data(ctx, &data_ctx);
 *
 */
-static void
+static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
         std::ostringstream rng_ss;
         rng_ss << ctx->rng;

-        const std::string & rng_str
+        const std::string & rng_str = rng_ss.str();
         const size_t rng_size = rng_str.size();

         GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
@@ -13870,25 +15146,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         data_ctx->write(rng_str.data(), rng_size);
     }

-    // copy
+    // copy outputs
     {
-
+        // Can't use ctx->n_outputs because it's not for the
+        // entire last batch when n_ubatch is smaller than n_batch
+        size_t n_outputs = 0;
+
+        // copy output ids
+        {
+            std::vector<int32_t> output_pos;

-
+            const size_t n_batch = ctx->cparams.n_batch;
+            const auto & output_ids = ctx->output_ids;

-
-
+            output_pos.resize(ctx->output_size);
+
+            // build a more compact representation of the output ids
+            for (size_t i = 0; i < n_batch; ++i) {
+                // map an output id to a position in the batch
+                int32_t pos = output_ids[i];
+                if (pos >= 0) {
+                    if ((size_t) pos >= n_outputs) {
+                        n_outputs = pos + 1;
+                    }
+                    GGML_ASSERT((size_t) pos < ctx->output_size);
+                    output_pos[pos] = i;
+                }
+            }
+
+            data_ctx->write(&n_outputs, sizeof(n_outputs));
+
+            if (n_outputs) {
+                data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
+            }
         }
-    }

-
-
-
+        // copy logits
+        {
+            const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);

-
+            data_ctx->write(&logits_size, sizeof(logits_size));

-
-
+            if (logits_size) {
+                data_ctx->write(ctx->logits, logits_size * sizeof(float));
+            }
+        }
+
+        // copy embeddings
+        {
+            const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
+
+            data_ctx->write(&embeddings_size, sizeof(embeddings_size));
+
+            if (embeddings_size) {
+                data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
+            }
         }
     }

@@ -13901,9 +15213,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
     const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();

-
+    // NOTE: kv_size and kv_buf_size are mostly used for sanity checks
     const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
     const uint32_t kv_size = kv_self.size;
+    const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
     const uint32_t kv_used = kv_self.used;

     data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
@@ -13912,6 +15225,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     data_ctx->write(&kv_used, sizeof(kv_used));

     if (kv_buf_size) {
+        const size_t pre_kv_buf_size = data_ctx->get_size_written();
+
         std::vector<uint8_t> tmp_buf;
         for (int il = 0; il < (int) n_layer; ++il) {
             const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -13941,6 +15256,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
             data_ctx->write(tmp_buf.data(), tmp_buf.size());
         }
     }
+    GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
     }

     for (uint32_t i = 0; i < kv_head; ++i) {
@@ -13959,15 +15275,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     }
 }

-size_t
+size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
     llama_data_buffer_context data_ctx(dst);
-
+    llama_state_get_data_internal(ctx, &data_ctx);

     return data_ctx.get_size_written();
 }

 // Sets the state reading from the specified source address
-size_t
+size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
     const uint8_t * inp = src;

     // set rng
@@ -13985,6 +15301,28 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         GGML_ASSERT(!rng_ss.fail());
     }

+    // set output ids
+    {
+        size_t n_outputs;
+        std::vector<int32_t> output_pos;
+
+        memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
+
+        GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
+
+        if (n_outputs) {
+            output_pos.resize(n_outputs);
+            memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
+            inp += n_outputs * sizeof(int32_t);
+
+            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
+                int32_t id = output_pos[i];
+                GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
+                ctx->output_ids[id] = i;
+            }
+        }
+    }
+
     // set logits
     {
         size_t logits_size;
@@ -14005,7 +15343,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {

         memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);

-        GGML_ASSERT(ctx->embd_size
+        GGML_ASSERT(ctx->embd_size >= embeddings_size);

         if (embeddings_size) {
             memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
@@ -14032,8 +15370,18 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
         memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);

+        if (kv_self.size != kv_size) {
+            // the KV cache needs to be big enough to load all the KV cells from the saved state
+            GGML_ASSERT(kv_self.size >= kv_head);
+
+            LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
+                __func__, kv_head, kv_size, kv_self.size);
+        }
+
         if (kv_buf_size) {
-
+            const size_t pre_kv_buf_size = inp - src;
+
+            GGML_ASSERT(kv_self.total_size() >= kv_buf_size);

             for (int il = 0; il < (int) n_layer; ++il) {
                 const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -14053,23 +15401,21 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {

                 // v is not contiguous, copy row by row
                 const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-                const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type,
+                const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);

                 for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
                     ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
                     inp += v_row_size;
                 }
             }
+            GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
         }

-
+        llama_kv_cache_clear(ctx);

         ctx->kv_self.head = kv_head;
-        ctx->kv_self.size = kv_size;
         ctx->kv_self.used = kv_used;

-        ctx->kv_self.cells.resize(kv_size);
-
         for (uint32_t i = 0; i < kv_head; ++i) {
             llama_pos pos;
             size_t seq_id_size;
@@ -14086,22 +15432,17 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
                 ctx->kv_self.cells[i].seq_id.insert(seq_id);
             }
         }
-
-        for (uint32_t i = kv_head; i < kv_size; ++i) {
-            ctx->kv_self.cells[i].pos = -1;
-            ctx->kv_self.cells[i].seq_id.clear();
-        }
     }

     const size_t nread = inp - src;
-    const size_t max_size =
+    const size_t max_size = llama_state_get_size(ctx);

     GGML_ASSERT(nread <= max_size);

     return nread;
 }

-static bool
+static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
     llama_file file(path_session, "rb");

     // sanity checks
@@ -14139,7 +15480,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
     // restore the context state
     {
         const size_t n_state_size_cur = file.size - file.tell();
-        const size_t n_state_size_max =
+        const size_t n_state_size_max = llama_state_get_size(ctx);

         if (n_state_size_cur > n_state_size_max) {
             LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
@@ -14149,22 +15490,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
         std::vector<uint8_t> state_data(n_state_size_max);
         file.read_raw(state_data.data(), n_state_size_cur);

-
+        llama_state_set_data(ctx, state_data.data());
     }

     return true;
 }

-bool
+bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
     try {
-        return
+        return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
         return false;
     }
 }

-bool
+static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
     llama_file file(path_session, "wb");

     file.write_u32(LLAMA_SESSION_MAGIC);
@@ -14178,11 +15519,420 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi

     // save the context state using stream saving
     llama_data_file_context data_ctx(&file);
-
+    llama_state_get_data_internal(ctx, &data_ctx);

     return true;
 }

+bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    try {
+        return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
+        return false;
+    }
+}
+
size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
|
15537
|
+
// save the size of size_t as a uint32_t for safety check
|
15538
|
+
const size_t size_t_size_size = sizeof(uint32_t);
|
15539
|
+
|
15540
|
+
// other values
|
15541
|
+
const size_t s_cell_count_size = sizeof(uint32_t);
|
15542
|
+
const size_t s_layer_count_size = sizeof(uint32_t);
|
15543
|
+
const size_t n_embd_v_gqa_size = sizeof(uint32_t);
|
15544
|
+
|
15545
|
+
size_t s_cell_count = 0;
|
15546
|
+
size_t s_cell_data_size = 0;
|
15547
|
+
const auto & kv_self = ctx->kv_self;
|
15548
|
+
const auto & hparams = ctx->model.hparams;
|
15549
|
+
|
15550
|
+
const uint32_t n_layer = hparams.n_layer;
|
15551
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
15552
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
15553
|
+
|
15554
|
+
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
15555
|
+
const auto & cell = kv_self.cells[i];
|
15556
|
+
if (cell.seq_id.count(seq_id) > 0) {
|
15557
|
+
++s_cell_count;
|
15558
|
+
s_cell_data_size += sizeof(llama_pos);
|
15559
|
+
}
|
15560
|
+
}
|
15561
|
+
|
15562
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
15563
|
+
// types of keys and values
|
15564
|
+
s_cell_data_size += sizeof(int32_t) * 2;
|
15565
|
+
// k_size_row and v_size_el values of layer
|
15566
|
+
s_cell_data_size += sizeof(size_t) * 2;
|
15567
|
+
|
15568
|
+
// keys
|
15569
|
+
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
15570
|
+
s_cell_data_size += k_size_row * s_cell_count;
|
15571
|
+
|
15572
|
+
// values (transposed)
|
15573
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
15574
|
+
s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
|
15575
|
+
}
|
15576
|
+
|
15577
|
+
const size_t s_total = (
|
15578
|
+
size_t_size_size +
|
15579
|
+
s_cell_count_size +
|
15580
|
+
s_layer_count_size +
|
15581
|
+
n_embd_v_gqa_size +
|
15582
|
+
s_cell_data_size
|
15583
|
+
);
|
15584
|
+
|
15585
|
+
return s_total;
|
15586
|
+
}
|
15587
|
+
|
15588
|
+
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
|
15589
|
+
const auto & kv_self = ctx->kv_self;
|
15590
|
+
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
15591
|
+
|
15592
|
+
// Save the size of size_t as a uint32_t for safety check
|
15593
|
+
const uint32_t size_t_size = sizeof(size_t);
|
15594
|
+
data_ctx.write(&size_t_size, sizeof(size_t_size));
|
15595
|
+
|
15596
|
+
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
|
15597
|
+
uint32_t cell_count = 0;
|
15598
|
+
|
15599
|
+
// Count the number of cells with the specified seq_id
|
15600
|
+
// Find all the ranges of cells with this seq id
|
15601
|
+
{
|
15602
|
+
uint32_t cell_range_begin = kv_self.size;
|
15603
|
+
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
15604
|
+
const auto & cell = kv_self.cells[i];
|
15605
|
+
if (cell.has_seq_id(seq_id)) {
|
15606
|
+
++cell_count;
|
15607
|
+
if (cell_range_begin == kv_self.size) {
|
15608
|
+
cell_range_begin = i;
|
15609
|
+
}
|
15610
|
+
}
|
15611
|
+
else {
|
15612
|
+
if (cell_range_begin != kv_self.size) {
|
15613
|
+
cell_ranges.push_back({ cell_range_begin, i });
|
15614
|
+
cell_range_begin = kv_self.size;
|
15615
|
+
}
|
15616
|
+
}
|
15617
|
+
}
|
15618
|
+
if (cell_range_begin != kv_self.size) {
|
15619
|
+
cell_ranges.push_back({ cell_range_begin, kv_self.size });
|
15620
|
+
}
|
15621
|
+
|
15622
|
+
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
|
15623
|
+
uint32_t cell_count_check = 0;
|
15624
|
+
for (const auto & range : cell_ranges) {
|
15625
|
+
cell_count_check += range.second - range.first;
|
15626
|
+
}
|
15627
|
+
GGML_ASSERT(cell_count == cell_count_check);
|
15628
|
+
}
|
15629
|
+
|
15630
|
+
// Write the cell count
|
15631
|
+
data_ctx.write(&cell_count, sizeof(cell_count));
|
15632
|
+
|
15633
|
+
const auto & hparams = ctx->model.hparams;
|
15634
|
+
const uint32_t n_layer = hparams.n_layer;
|
15635
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
15636
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
15637
|
+
|
15638
|
+
// Write the layer count
|
15639
|
+
data_ctx.write(&n_layer, sizeof(n_layer));
|
15640
|
+
|
15641
|
+
// Write n_embd_v_gqa
|
15642
|
+
data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
|
15643
|
+
|
15644
|
+
// Iterate the ranges and write all the pos (this is the token position in the prompt)
|
15645
|
+
for (const auto & range : cell_ranges) {
|
15646
|
+
for (uint32_t i = range.first; i < range.second; ++i) {
|
15647
|
+
const auto & cell = kv_self.cells[i];
|
15648
|
+
data_ctx.write(&cell.pos, sizeof(cell.pos));
|
15649
|
+
}
|
15650
|
+
}
|
15651
|
+
|
15652
|
+
// Iterate and write all the keys first, each row is a cell
|
15653
|
+
// Get whole range at a time
|
15654
|
+
std::vector<uint8_t> tmp_buf;
|
15655
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
15656
|
+
// Write key type
|
15657
|
+
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
|
15658
|
+
data_ctx.write(&k_type_i, sizeof(k_type_i));
|
15659
|
+
|
15660
|
+
// Write row size of key
|
15661
|
+
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
15662
|
+
data_ctx.write(&k_size_row, sizeof(k_size_row));
|
15663
|
+
|
15664
|
+
// Read each range of cells of k_size length each into tmp_buf and write out
|
15665
|
+
for (const auto & range : cell_ranges) {
|
15666
|
+
const size_t range_size = range.second - range.first;
|
15667
|
+
tmp_buf.resize(range_size * k_size_row);
|
15668
|
+
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
|
15669
|
+
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
15670
|
+
}
|
15671
|
+
}
|
15672
|
+
|
15673
|
+
// For the values, they are transposed, so we also need the element size and get the element ranges from each row
|
15674
|
+
const uint32_t kv_size = kv_self.size;
|
15675
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
15676
|
+
// Write value type
|
15677
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
15678
|
+
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
15679
|
+
|
15680
|
+
// Write element size
|
15681
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
15682
|
+
data_ctx.write(&v_size_el, sizeof(v_size_el));
|
15683
|
+
|
15684
|
+
// For each row, we get the element values of each cell
|
15685
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
15686
|
+
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
15687
|
+
for (const auto & range : cell_ranges) {
|
15688
|
+
const size_t range_size = range.second - range.first;
|
15689
|
+
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
15690
|
+
tmp_buf.resize(range_size * v_size_el);
|
15691
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
15692
|
+
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
15693
|
+
}
|
15694
|
+
}
|
15695
|
+
}
|
15696
|
+
|
15697
|
+
return data_ctx.get_size_written();
|
15698
|
+
}
|
15699
|
+
|
15700
|
+
size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
|
15701
|
+
llama_data_buffer_context data_ctx(dst);
|
15702
|
+
return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
|
15703
|
+
}
|
15704
|
+
|
15705
|
+
size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
|
15706
|
+
auto & kv_self = ctx->kv_self;
|
15707
|
+
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
15708
|
+
|
15709
|
+
// Wipe the slot
|
15710
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
15711
|
+
|
15712
|
+
const uint8_t * inp = src;
|
15713
|
+
|
15714
|
+
// Read size of size_t
|
15715
|
+
uint32_t size_t_size;
|
15716
|
+
memcpy(&size_t_size, inp, sizeof(size_t_size));
|
15717
|
+
inp += sizeof(size_t_size);
|
15718
|
+
if (size_t_size != sizeof(size_t)) {
|
15719
|
+
LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
|
15720
|
+
return 0;
|
15721
|
+
}
|
15722
|
+
|
15723
|
+
// Read the cell count
|
15724
|
+
uint32_t cell_count;
|
15725
|
+
memcpy(&cell_count, inp, sizeof(cell_count));
|
15726
|
+
inp += sizeof(cell_count);
|
15727
|
+
|
15728
|
+
// Read the layer count
|
15729
|
+
uint32_t n_layer_ref;
|
15730
|
+
memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
|
15731
|
+
inp += sizeof(n_layer_ref);
|
15732
|
+
|
15733
|
+
// Read n_embd_v_gqa
|
15734
|
+
uint32_t n_embd_v_gqa_ref;
|
15735
|
+
memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
|
15736
|
+
inp += sizeof(n_embd_v_gqa_ref);
|
15737
|
+
|
15738
|
+
// Sanity check model compatibility
|
15739
|
+
const auto & hparams = ctx->model.hparams;
|
15740
|
+
const uint32_t n_layer = hparams.n_layer;
|
15741
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
15742
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
15743
|
+
if (n_layer != n_layer_ref) {
|
15744
|
+
LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
|
15745
|
+
return 0;
|
15746
|
+
}
|
15747
|
+
if (n_embd_v_gqa != n_embd_v_gqa_ref) {
|
15748
|
+
LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
|
15749
|
+
return 0;
|
15750
|
+
}
|
15751
|
+
|
15752
|
+
// Allocate the new cells for the slot
|
15753
|
+
if (cell_count) {
|
15754
|
+
llama_batch batch = llama_batch_init(cell_count, 0, 1);
|
15755
|
+
batch.n_tokens = cell_count;
|
15756
|
+
for (uint32_t i = 0; i < cell_count; ++i) {
|
15757
|
+
llama_pos pos;
|
15758
|
+
memcpy(&pos, inp, sizeof(pos));
|
15759
|
+
inp += sizeof(pos);
|
15760
|
+
|
15761
|
+
batch.pos[i] = pos;
|
15762
|
+
batch.n_seq_id[i] = 1;
|
15763
|
+
batch.seq_id[i][0] = dest_seq_id;
|
15764
|
+
}
|
15765
|
+
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
15766
|
+
llama_batch_free(batch);
|
15767
|
+
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
|
15768
|
+
return 0;
|
15769
|
+
}
|
15770
|
+
|
15771
|
+
// DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
|
15772
|
+
// Assume that this is one contiguous block of cells
|
15773
|
+
GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
|
15774
|
+
GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
|
15775
|
+
GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
|
15776
|
+
GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
|
15777
|
+
GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
|
15778
|
+
|
15779
|
+
// Cleanup
|
15780
|
+
llama_batch_free(batch);
|
15781
|
+
}
|
15782
|
+
|
15783
|
+
const uint32_t kv_size = kv_self.size;
|
15784
|
+
const uint32_t kv_head = kv_self.head;
|
15785
|
+
|
15786
|
+
// For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo
|
15787
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
15788
|
+
// Read type of key
|
15789
|
+
int32_t k_type_i_ref;
|
15790
|
+
memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
|
15791
|
+
inp += sizeof(k_type_i_ref);
|
15792
|
+
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
|
15793
|
+
if (k_type_i != k_type_i_ref) {
|
15794
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
15795
|
+
LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
|
15796
|
+
return 0;
|
15797
|
+
}
|
15798
|
+
|
15799
|
+
// Read row size of key
|
15800
|
+
size_t k_size_row_ref;
|
15801
|
+
memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
|
15802
|
+
inp += sizeof(k_size_row_ref);
|
15803
|
+
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
15804
|
+
if (k_size_row != k_size_row_ref) {
|
15805
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
15806
|
+
LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
|
15807
|
+
return 0;
|
15808
|
+
}
|
15809
|
+
|
15810
|
+
if (cell_count) {
|
15811
|
+
// Read and set the keys for the whole cell range
|
15812
|
+
ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
|
15813
|
+
inp += cell_count * k_size_row;
|
15814
|
+
}
|
15815
|
+
}
|
15816
|
+
|
15817
|
+
// For each layer, read the values for each cell (transposed)
|
15818
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
15819
|
+
// Read type of value
|
15820
|
+
int32_t v_type_i_ref;
|
15821
|
+
memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
|
15822
|
+
inp += sizeof(v_type_i_ref);
|
15823
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
15824
|
+
if (v_type_i != v_type_i_ref) {
|
15825
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
15826
|
+
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
15827
|
+
return 0;
|
15828
|
+
}
|
15829
|
+
|
15830
|
+
// Read element size of value
|
15831
|
+
size_t v_size_el_ref;
|
15832
|
+
memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
|
15833
|
+
inp += sizeof(v_size_el_ref);
|
15834
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
15835
|
+
if (v_size_el != v_size_el_ref) {
|
15836
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
15837
|
+
LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
|
15838
|
+
return 0;
|
15839
|
+
}
|
15840
|
+
|
15841
|
+
if (cell_count) {
|
15842
|
+
// For each row in the transposed matrix, read the values for the whole cell range
|
15843
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
15844
|
+
const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
|
15845
|
+
ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
|
15846
|
+
inp += cell_count * v_size_el;
|
15847
|
+
}
|
15848
|
+
}
|
15849
|
+
}
|
15850
|
+
|
15851
|
+
const size_t nread = inp - src;
|
15852
|
+
return nread;
|
15853
|
+
}
|
15854
|
+
|
15855
|
+
static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
|
15856
|
+
llama_file file(filepath, "wb");
|
15857
|
+
|
15858
|
+
file.write_u32(LLAMA_STATE_SEQ_MAGIC);
|
15859
|
+
file.write_u32(LLAMA_STATE_SEQ_VERSION);
|
15860
|
+
|
15861
|
+
// save the prompt
|
15862
|
+
file.write_u32((uint32_t)n_token_count);
|
15863
|
+
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
|
15864
|
+
|
15865
|
+
// save the context state using stream saving
|
15866
|
+
llama_data_file_context data_ctx(&file);
|
15867
|
+
llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
|
15868
|
+
|
15869
|
+
const size_t res = file.tell();
|
15870
|
+
GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
|
15871
|
+
return res;
|
15872
|
+
}
|
15873
|
+
|
15874
|
+
static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
15875
|
+
llama_file file(filepath, "rb");
|
15876
|
+
|
15877
|
+
// version checks
|
15878
|
+
{
|
15879
|
+
const uint32_t magic = file.read_u32();
|
15880
|
+
const uint32_t version = file.read_u32();
|
15881
|
+
|
15882
|
+
if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
|
15883
|
+
LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
|
15884
|
+
return 0;
|
15885
|
+
}
|
15886
|
+
}
|
15887
|
+
|
15888
|
+
// load the prompt
|
15889
|
+
{
|
15890
|
+
const uint32_t n_token_count = file.read_u32();
|
15891
|
+
|
15892
|
+
if (n_token_count > n_token_capacity) {
|
15893
|
+
LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
|
15894
|
+
return 0;
|
15895
|
+
}
|
15896
|
+
|
15897
|
+
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
|
15898
|
+
*n_token_count_out = n_token_count;
|
15899
|
+
}
|
15900
|
+
|
15901
|
+
// restore the context state
|
15902
|
+
{
|
15903
|
+
const size_t state_size = file.size - file.tell();
|
15904
|
+
std::vector<uint8_t> state_data(state_size);
|
15905
|
+
file.read_raw(state_data.data(), state_size);
|
15906
|
+
const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
|
15907
|
+
if (!nread) {
|
15908
|
+
LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
|
15909
|
+
return 0;
|
15910
|
+
}
|
15911
|
+
GGML_ASSERT(nread <= state_size);
|
15912
|
+
GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
|
15913
|
+
}
|
15914
|
+
|
15915
|
+
return file.tell();
|
15916
|
+
}
|
15917
|
+
|
15918
|
+
size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
|
15919
|
+
try {
|
15920
|
+
return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
|
15921
|
+
} catch (const std::exception & err) {
|
15922
|
+
LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
|
15923
|
+
return 0;
|
15924
|
+
}
|
15925
|
+
}
|
15926
|
+
|
15927
|
+
size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
15928
|
+
try {
|
15929
|
+
return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
|
15930
|
+
} catch (const std::exception & err) {
|
15931
|
+
LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
|
15932
|
+
return 0;
|
15933
|
+
}
|
15934
|
+
}
|
15935
|
+
|
14186
15936
|
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
|
14187
15937
|
ctx->cparams.n_threads = n_threads;
|
14188
15938
|
ctx->cparams.n_threads_batch = n_threads_batch;
|
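A hedged sketch of the per-sequence state functions added above: they persist only the KV cells belonging to one seq_id plus its prompt, and restore them under a destination seq_id (possibly in a different context). The path and token capacity are illustrative:

    #include <vector>
    #include "llama.h"

    bool save_seq(llama_context * ctx, llama_seq_id seq_id, const std::vector<llama_token> & tokens) {
        return llama_state_seq_save_file(ctx, "seq0.bin", seq_id, tokens.data(), tokens.size()) > 0;
    }

    bool load_seq(llama_context * ctx, llama_seq_id dest_seq_id, std::vector<llama_token> & tokens_out) {
        tokens_out.resize(4096); // illustrative capacity for the saved prompt
        size_t n_tokens = 0;
        const size_t nread = llama_state_seq_load_file(ctx, "seq0.bin", dest_seq_id,
                                                       tokens_out.data(), tokens_out.size(), &n_tokens);
        tokens_out.resize(n_tokens);
        return nread > 0;
    }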
@@ -14296,11 +16046,41 @@ float * llama_get_logits(struct llama_context * ctx) {
 }

 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
-
-
+    int32_t j = -1;
     llama_synchronize(ctx);

-
+    try {
+        if (ctx->logits == nullptr) {
+            throw std::runtime_error("no logits");
+        }
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
+            throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
+        }
+
+        if (j < 0) {
+            throw std::runtime_error(format("batch.logits[%d] != true", i));
+        }
+        if (j >= ctx->n_outputs) {
+            // This should not happen
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
+        }
+
+        return ctx->logits + j*ctx->model.hparams.n_vocab;
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
+#ifndef NDEBUG
+        GGML_ASSERT(false);
+#endif
+        return nullptr;
+    }
 }

 float * llama_get_embeddings(struct llama_context * ctx) {
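With this change, llama_get_logits_ith accepts negative indices counted from the end of the batch outputs and returns nullptr (instead of asserting in release builds) on an invalid id. A minimal greedy-pick sketch; `ctx` is assumed to have just run llama_decode() on a batch with at least one output:

    #include "llama.h"

    llama_token greedy_last_token(llama_context * ctx) {
        float * logits = llama_get_logits_ith(ctx, -1); // -1 = last output; nullptr on invalid id
        if (logits == nullptr) {
            return -1;
        }
        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
        int best = 0;
        for (int t = 1; t < n_vocab; ++t) {
            if (logits[t] > logits[best]) {
                best = t;
            }
        }
        return best;
    }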
@@ -14310,9 +16090,42 @@ float * llama_get_embeddings(struct llama_context * ctx) {
 }

 float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+    int32_t j = -1;
+
     llama_synchronize(ctx);

-
+    try {
+        if (ctx->embd == nullptr) {
+            throw std::runtime_error("no embeddings");
+        }
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
+            throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
+        }
+
+        if (j < 0) {
+            throw std::runtime_error(format("batch.logits[%d] != true", i));
+        }
+        if (j >= ctx->n_outputs) {
+            // This should not happen
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
+        }
+
+        return ctx->embd + j*ctx->model.hparams.n_embd;
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
+#ifndef NDEBUG
+        GGML_ASSERT(false);
+#endif
+        return nullptr;
+    }
 }

 float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
@@ -14349,6 +16162,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
     return model->vocab.special_eos_id;
 }

+llama_token llama_token_cls(const struct llama_model * model) {
+    return model->vocab.special_cls_id;
+}
+
+llama_token llama_token_sep(const struct llama_model * model) {
+    return model->vocab.special_sep_id;
+}
+
 llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }
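The new accessors expose the CLS and SEP special tokens, which BERT-style encoder models typically expect around their input. A hedged sketch of wrapping a tokenized text with them; the wrapping convention is the usual [CLS] ... [SEP] pattern and is an assumption here, not something this diff mandates:

    #include <vector>
    #include "llama.h"

    std::vector<llama_token> wrap_for_encoder(const llama_model * model,
                                              const std::vector<llama_token> & text_tokens) {
        std::vector<llama_token> out;
        out.push_back(llama_token_cls(model));                          // leading [CLS]
        out.insert(out.end(), text_tokens.begin(), text_tokens.end());
        out.push_back(llama_token_sep(model));                          // trailing [SEP]
        return out;
    }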
@@ -14383,9 +16204,9 @@ int32_t llama_tokenize(
                    int32_t text_len,
                llama_token * tokens,
                    int32_t n_tokens_max,
-                       bool
-                       bool
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len),
+                       bool add_special,
+                       bool parse_special) {
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);

     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
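The last two llama_tokenize parameters are now named add_special (whether to add the model's BOS/EOS special tokens) and parse_special (whether special-token text in the input is parsed as tokens). A minimal calling sketch; the buffer sizing is a rough upper bound, not part of this diff:

    #include <string>
    #include <vector>
    #include "llama.h"

    std::vector<llama_token> tokenize(const llama_model * model, const std::string & text) {
        std::vector<llama_token> tokens(text.size() + 8); // generous upper bound on token count
        const int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                                         tokens.data(), (int32_t) tokens.size(),
                                         /*add_special  =*/ true,
                                         /*parse_special=*/ false);
        tokens.resize(n > 0 ? n : 0);
        return tokens;
    }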
@@ -14602,6 +16423,55 @@ static int32_t llama_chat_apply_template_internal(
                 ss << message->content << "</s>";
             }
         }
+    } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
+        // openchat/openchat-3.5-0106,
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "<|end_of_turn|>";
+            } else {
+                role[0] = toupper(role[0]);
+                ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
+            }
+        }
+        if (add_ass) {
+            ss << "GPT4 Correct Assistant:";
+        }
+    } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
+        // eachadea/vicuna-13b-1.1 (and Orca variant)
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // Orca-Vicuna variant uses a system prefix
+                if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
+                    ss << "SYSTEM: " << message->content << "\n";
+                } else {
+                    ss << message->content << "\n\n";
+                }
+            } else if (role == "user") {
+                ss << "USER: " << message->content << "\n";
+            } else if (role == "assistant") {
+                ss << "ASSISTANT: " << message->content << "</s>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "ASSISTANT:";
+        }
+    } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
+        // deepseek-ai/deepseek-coder-33b-instruct
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content;
+            } else if (role == "user") {
+                ss << "### Instruction:\n" << message->content << "\n";
+            } else if (role == "assistant") {
+                ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "### Response:\n";
+        }
     } else {
         // template not supported
         return -1;
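These branches let llama_chat_apply_template recognize the openchat, vicuna/vicuna-orca, and deepseek formats, either from a model's embedded template string or from the short names. A hedged sketch of requesting one by name; the buffer size and the nullptr model (bypassing the model's own template) are illustrative choices:

    #include <string>
    #include <vector>
    #include "llama.h"

    std::string format_vicuna(const std::vector<llama_chat_message> & msgs) {
        std::vector<char> buf(4096); // illustrative output capacity
        const int32_t n = llama_chat_apply_template(/*model=*/ nullptr, "vicuna",
                                                    msgs.data(), msgs.size(),
                                                    /*add_ass=*/ true,
                                                    buf.data(), (int32_t) buf.size());
        return n > 0 ? std::string(buf.data(), n) : std::string();
    }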
@@ -14651,6 +16521,30 @@ LLAMA_API int32_t llama_chat_apply_template(
     return res;
 }

+LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
+    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
+    if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
+        return strlen(split_path);
+    }
+    return 0;
+}
+
+int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
+    std::string str_split_path(split_path);
+    char postfix[32];
+    snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
+    std::string str_postfix(postfix);
+
+    // check if dest ends with postfix
+    int size_prefix = str_split_path.size() - str_postfix.size();
+    if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
+        snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
+        return size_prefix;
+    }
+
+    return 0;
+}
+
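llama_split_path builds the conventional "-%05d-of-%05d.gguf" shard name for split GGUF models, and llama_split_prefix recovers the prefix from such a name. A small sketch; the prefix string is a placeholder:

    #include <cstdio>
    #include "llama.h"

    int main() {
        char path[512];
        // e.g. "mixtral-q4" -> "mixtral-q4-00001-of-00004.gguf"
        const int len = llama_split_path(path, sizeof(path), "mixtral-q4", /*split_no=*/ 0, /*split_count=*/ 4);
        if (len > 0) {
            printf("%s\n", path);
        }
        return 0;
    }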
 struct llama_timings llama_get_timings(struct llama_context * ctx) {
     struct llama_timings result = {
         /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,