llama_cpp 0.14.2 → 0.14.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
@@ -7,7 +7,7 @@
|
|
7
7
|
#include "ggml-alloc.h"
|
8
8
|
#include "ggml-backend.h"
|
9
9
|
|
10
|
-
#ifdef
|
10
|
+
#ifdef GGML_USE_CUDA
|
11
11
|
# include "ggml-cuda.h"
|
12
12
|
#elif defined(GGML_USE_CLBLAST)
|
13
13
|
# include "ggml-opencl.h"
|
@@ -52,12 +52,16 @@
|
|
52
52
|
#define NOMINMAX
|
53
53
|
#endif
|
54
54
|
#include <windows.h>
|
55
|
+
#ifndef PATH_MAX
|
56
|
+
#define PATH_MAX MAX_PATH
|
57
|
+
#endif
|
55
58
|
#include <io.h>
|
56
59
|
#endif
|
57
60
|
|
58
61
|
#include <algorithm>
|
59
62
|
#include <array>
|
60
63
|
#include <cassert>
|
64
|
+
#include <cctype>
|
61
65
|
#include <cfloat>
|
62
66
|
#include <cinttypes>
|
63
67
|
#include <climits>
|
@@ -68,7 +72,6 @@
|
|
68
72
|
#include <cstdio>
|
69
73
|
#include <cstring>
|
70
74
|
#include <ctime>
|
71
|
-
#include <cwctype>
|
72
75
|
#include <forward_list>
|
73
76
|
#include <fstream>
|
74
77
|
#include <functional>
|
@@ -192,6 +195,7 @@ enum llm_arch {
|
|
192
195
|
LLM_ARCH_LLAMA,
|
193
196
|
LLM_ARCH_FALCON,
|
194
197
|
LLM_ARCH_BAICHUAN,
|
198
|
+
LLM_ARCH_GROK,
|
195
199
|
LLM_ARCH_GPT2,
|
196
200
|
LLM_ARCH_GPTJ,
|
197
201
|
LLM_ARCH_GPTNEOX,
|
@@ -214,12 +218,15 @@ enum llm_arch {
|
|
214
218
|
LLM_ARCH_GEMMA,
|
215
219
|
LLM_ARCH_STARCODER2,
|
216
220
|
LLM_ARCH_MAMBA,
|
221
|
+
LLM_ARCH_XVERSE,
|
222
|
+
LLM_ARCH_COMMAND_R,
|
217
223
|
LLM_ARCH_UNKNOWN,
|
218
224
|
};
|
219
225
|
|
220
226
|
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
221
227
|
{ LLM_ARCH_LLAMA, "llama" },
|
222
228
|
{ LLM_ARCH_FALCON, "falcon" },
|
229
|
+
{ LLM_ARCH_GROK, "grok" },
|
223
230
|
{ LLM_ARCH_GPT2, "gpt2" },
|
224
231
|
{ LLM_ARCH_GPTJ, "gptj" },
|
225
232
|
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
@@ -243,6 +250,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
243
250
|
{ LLM_ARCH_GEMMA, "gemma" },
|
244
251
|
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
245
252
|
{ LLM_ARCH_MAMBA, "mamba" },
|
253
|
+
{ LLM_ARCH_XVERSE, "xverse" },
|
254
|
+
{ LLM_ARCH_COMMAND_R, "command-r" },
|
246
255
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
247
256
|
};
|
248
257
|
|
@@ -268,6 +277,7 @@ enum llm_kv {
|
|
268
277
|
LLM_KV_EXPERT_COUNT,
|
269
278
|
LLM_KV_EXPERT_USED_COUNT,
|
270
279
|
LLM_KV_POOLING_TYPE,
|
280
|
+
LLM_KV_LOGIT_SCALE,
|
271
281
|
|
272
282
|
LLM_KV_ATTENTION_HEAD_COUNT,
|
273
283
|
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
@@ -287,6 +297,10 @@ enum llm_kv {
|
|
287
297
|
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
|
288
298
|
LLM_KV_ROPE_SCALING_FINETUNED,
|
289
299
|
|
300
|
+
LLM_KV_SPLIT_NO,
|
301
|
+
LLM_KV_SPLIT_COUNT,
|
302
|
+
LLM_KV_SPLIT_TENSORS_COUNT,
|
303
|
+
|
290
304
|
LLM_KV_SSM_INNER_SIZE,
|
291
305
|
LLM_KV_SSM_CONV_KERNEL,
|
292
306
|
LLM_KV_SSM_STATE_SIZE,
|
@@ -332,6 +346,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
332
346
|
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
333
347
|
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
334
348
|
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
|
349
|
+
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
335
350
|
|
336
351
|
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
337
352
|
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
@@ -351,6 +366,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
351
366
|
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
|
352
367
|
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
|
353
368
|
|
369
|
+
{ LLM_KV_SPLIT_NO, "split.no" },
|
370
|
+
{ LLM_KV_SPLIT_COUNT, "split.count" },
|
371
|
+
{ LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
|
372
|
+
|
354
373
|
{ LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
|
355
374
|
{ LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
|
356
375
|
{ LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
|
@@ -407,9 +426,12 @@ enum llm_tensor {
|
|
407
426
|
LLM_TENSOR_FFN_DOWN,
|
408
427
|
LLM_TENSOR_FFN_UP,
|
409
428
|
LLM_TENSOR_FFN_ACT,
|
410
|
-
LLM_TENSOR_FFN_DOWN_EXP,
|
429
|
+
LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
|
411
430
|
LLM_TENSOR_FFN_GATE_EXP,
|
412
431
|
LLM_TENSOR_FFN_UP_EXP,
|
432
|
+
LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
|
433
|
+
LLM_TENSOR_FFN_GATE_EXPS,
|
434
|
+
LLM_TENSOR_FFN_UP_EXPS,
|
413
435
|
LLM_TENSOR_ATTN_Q_NORM,
|
414
436
|
LLM_TENSOR_ATTN_K_NORM,
|
415
437
|
LLM_TENSOR_LAYER_OUT_NORM,
|
@@ -444,6 +466,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
444
466
|
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
445
467
|
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
446
468
|
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
469
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
470
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
471
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
447
472
|
},
|
448
473
|
},
|
449
474
|
{
|
@@ -479,6 +504,31 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
479
504
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
480
505
|
},
|
481
506
|
},
|
507
|
+
{
|
508
|
+
LLM_ARCH_GROK,
|
509
|
+
{
|
510
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
511
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
512
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
513
|
+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
514
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
515
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
516
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
517
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
518
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
519
|
+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
520
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
521
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
522
|
+
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
523
|
+
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
524
|
+
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
525
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
526
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
527
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
528
|
+
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
529
|
+
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
530
|
+
},
|
531
|
+
},
|
482
532
|
{
|
483
533
|
LLM_ARCH_GPT2,
|
484
534
|
{
|
@@ -536,6 +586,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
536
586
|
{
|
537
587
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
538
588
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
589
|
+
{ LLM_TENSOR_OUTPUT, "output"},
|
539
590
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
540
591
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
541
592
|
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
@@ -543,6 +594,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
543
594
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
544
595
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
545
596
|
{ LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
|
597
|
+
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
598
|
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
|
599
|
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
|
546
600
|
},
|
547
601
|
},
|
548
602
|
{
|
@@ -838,6 +892,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
838
892
|
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
|
839
893
|
},
|
840
894
|
},
|
895
|
+
{
|
896
|
+
LLM_ARCH_XVERSE,
|
897
|
+
{
|
898
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
899
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
900
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
901
|
+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
902
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
903
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
904
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
905
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
906
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
907
|
+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
908
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
909
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
910
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
911
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
912
|
+
},
|
913
|
+
},
|
914
|
+
{
|
915
|
+
LLM_ARCH_COMMAND_R,
|
916
|
+
{
|
917
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
918
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
919
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
920
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
921
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
922
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
923
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
924
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
925
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
926
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
927
|
+
},
|
928
|
+
},
|
841
929
|
{
|
842
930
|
LLM_ARCH_UNKNOWN,
|
843
931
|
{
|
@@ -1010,7 +1098,7 @@ struct llama_file {
|
|
1010
1098
|
size_t size;
|
1011
1099
|
|
1012
1100
|
llama_file(const char * fname, const char * mode) {
|
1013
|
-
fp =
|
1101
|
+
fp = ggml_fopen(fname, mode);
|
1014
1102
|
if (fp == NULL) {
|
1015
1103
|
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
1016
1104
|
}
|
@@ -1079,6 +1167,7 @@ struct llama_file {
|
|
1079
1167
|
}
|
1080
1168
|
}
|
1081
1169
|
};
|
1170
|
+
using llama_files = std::vector<std::unique_ptr<llama_file>>;
|
1082
1171
|
|
1083
1172
|
struct llama_mmap {
|
1084
1173
|
void * addr;
|
@@ -1279,6 +1368,7 @@ struct llama_mmap {
|
|
1279
1368
|
}
|
1280
1369
|
#endif
|
1281
1370
|
};
|
1371
|
+
using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
|
1282
1372
|
|
1283
1373
|
// Represents some region of memory being locked using mlock or VirtualLock;
|
1284
1374
|
// will automatically unlock on destruction.
|
@@ -1428,6 +1518,7 @@ struct llama_mlock {
|
|
1428
1518
|
static void raw_unlock(const void * addr, size_t len) {}
|
1429
1519
|
#endif
|
1430
1520
|
};
|
1521
|
+
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
1431
1522
|
|
1432
1523
|
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
|
1433
1524
|
std::vector<char> result(8, 0);
|
@@ -1447,7 +1538,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
|
|
1447
1538
|
static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
|
1448
1539
|
ggml_backend_buffer_type_t buft = nullptr;
|
1449
1540
|
|
1450
|
-
#if defined(
|
1541
|
+
#if defined(GGML_USE_CUDA)
|
1451
1542
|
// host buffers should only be used when data is expected to be copied to/from the GPU
|
1452
1543
|
if (host_buffer) {
|
1453
1544
|
buft = ggml_backend_cuda_host_buffer_type();
|
@@ -1477,7 +1568,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
|
1477
1568
|
|
1478
1569
|
#ifdef GGML_USE_METAL
|
1479
1570
|
buft = ggml_backend_metal_buffer_type();
|
1480
|
-
#elif defined(
|
1571
|
+
#elif defined(GGML_USE_CUDA)
|
1481
1572
|
buft = ggml_backend_cuda_buffer_type(gpu);
|
1482
1573
|
#elif defined(GGML_USE_VULKAN)
|
1483
1574
|
buft = ggml_backend_vk_buffer_type(gpu);
|
@@ -1503,7 +1594,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
|
1503
1594
|
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
|
1504
1595
|
ggml_backend_buffer_type_t buft = nullptr;
|
1505
1596
|
|
1506
|
-
#ifdef
|
1597
|
+
#ifdef GGML_USE_CUDA
|
1507
1598
|
if (ggml_backend_cuda_get_device_count() > 1) {
|
1508
1599
|
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
|
1509
1600
|
}
|
@@ -1524,7 +1615,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
|
|
1524
1615
|
}
|
1525
1616
|
|
1526
1617
|
static size_t llama_get_device_count() {
|
1527
|
-
#if defined(
|
1618
|
+
#if defined(GGML_USE_CUDA)
|
1528
1619
|
return ggml_backend_cuda_get_device_count();
|
1529
1620
|
#elif defined(GGML_USE_SYCL)
|
1530
1621
|
return ggml_backend_sycl_get_device_count();
|
@@ -1536,7 +1627,7 @@ static size_t llama_get_device_count() {
|
|
1536
1627
|
}
|
1537
1628
|
|
1538
1629
|
static size_t llama_get_device_memory(int device) {
|
1539
|
-
#if defined(
|
1630
|
+
#if defined(GGML_USE_CUDA)
|
1540
1631
|
size_t total;
|
1541
1632
|
size_t free;
|
1542
1633
|
ggml_backend_cuda_get_device_memory(device, &total, &free);
|
@@ -1597,9 +1688,11 @@ enum e_model {
|
|
1597
1688
|
MODEL_20B,
|
1598
1689
|
MODEL_30B,
|
1599
1690
|
MODEL_34B,
|
1691
|
+
MODEL_35B,
|
1600
1692
|
MODEL_40B,
|
1601
1693
|
MODEL_65B,
|
1602
1694
|
MODEL_70B,
|
1695
|
+
MODEL_314B,
|
1603
1696
|
MODEL_SMALL,
|
1604
1697
|
MODEL_MEDIUM,
|
1605
1698
|
MODEL_LARGE,
|
@@ -1643,6 +1736,7 @@ struct llama_hparams {
|
|
1643
1736
|
|
1644
1737
|
float f_clamp_kqv = 0.0f;
|
1645
1738
|
float f_max_alibi_bias = 0.0f;
|
1739
|
+
float f_logit_scale = 0.0f;
|
1646
1740
|
|
1647
1741
|
bool causal_attn = true;
|
1648
1742
|
bool need_kq_pos = false;
|
@@ -1716,6 +1810,7 @@ struct llama_cparams {
|
|
1716
1810
|
uint32_t n_ctx; // context size used during inference
|
1717
1811
|
uint32_t n_batch;
|
1718
1812
|
uint32_t n_ubatch;
|
1813
|
+
uint32_t n_seq_max;
|
1719
1814
|
uint32_t n_threads; // number of threads to use for generation
|
1720
1815
|
uint32_t n_threads_batch; // number of threads to use for batch processing
|
1721
1816
|
|
@@ -1781,9 +1876,9 @@ struct llama_layer {
|
|
1781
1876
|
|
1782
1877
|
// ff MoE
|
1783
1878
|
struct ggml_tensor * ffn_gate_inp;
|
1784
|
-
struct ggml_tensor *
|
1785
|
-
struct ggml_tensor *
|
1786
|
-
struct ggml_tensor *
|
1879
|
+
struct ggml_tensor * ffn_gate_exps;
|
1880
|
+
struct ggml_tensor * ffn_down_exps;
|
1881
|
+
struct ggml_tensor * ffn_up_exps ;
|
1787
1882
|
|
1788
1883
|
// ff bias
|
1789
1884
|
struct ggml_tensor * ffn_down_b; // b2
|
@@ -1873,6 +1968,31 @@ struct llama_kv_cache {
|
|
1873
1968
|
}
|
1874
1969
|
};
|
1875
1970
|
|
1971
|
+
struct llama_control_vector {
|
1972
|
+
std::vector<struct ggml_tensor *> tensors; // per layer
|
1973
|
+
std::vector<struct ggml_context *> ctxs;
|
1974
|
+
std::vector<ggml_backend_buffer_t> bufs;
|
1975
|
+
|
1976
|
+
int32_t layer_start = -1;
|
1977
|
+
int32_t layer_end = -1;
|
1978
|
+
|
1979
|
+
ggml_tensor * tensor_for(int il) const {
|
1980
|
+
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
|
1981
|
+
return nullptr;
|
1982
|
+
}
|
1983
|
+
return tensors[il];
|
1984
|
+
}
|
1985
|
+
|
1986
|
+
~llama_control_vector() {
|
1987
|
+
for (struct ggml_context * ctx : ctxs) {
|
1988
|
+
ggml_free(ctx);
|
1989
|
+
}
|
1990
|
+
for (ggml_backend_buffer_t buf : bufs) {
|
1991
|
+
ggml_backend_buffer_free(buf);
|
1992
|
+
}
|
1993
|
+
}
|
1994
|
+
};
|
1995
|
+
|
1876
1996
|
struct llama_vocab {
|
1877
1997
|
using id = int32_t;
|
1878
1998
|
using token = std::string;
|
@@ -1976,12 +2096,12 @@ struct llama_model {
|
|
1976
2096
|
// the model memory buffers for the tensor data
|
1977
2097
|
std::vector<ggml_backend_buffer_t> bufs;
|
1978
2098
|
|
1979
|
-
// model memory mapped
|
1980
|
-
|
2099
|
+
// model memory mapped files
|
2100
|
+
llama_mmaps mappings;
|
1981
2101
|
|
1982
2102
|
// objects representing data potentially being locked in memory
|
1983
|
-
|
1984
|
-
|
2103
|
+
llama_mlocks mlock_bufs;
|
2104
|
+
llama_mlocks mlock_mmaps;
|
1985
2105
|
|
1986
2106
|
// for quantize-stats only
|
1987
2107
|
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
@@ -1994,6 +2114,11 @@ struct llama_model {
|
|
1994
2114
|
ggml_free(ctx);
|
1995
2115
|
}
|
1996
2116
|
for (ggml_backend_buffer_t buf : bufs) {
|
2117
|
+
#ifdef GGML_USE_CUDA
|
2118
|
+
if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
|
2119
|
+
ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
|
2120
|
+
}
|
2121
|
+
#endif
|
1997
2122
|
ggml_backend_buffer_free(buf);
|
1998
2123
|
}
|
1999
2124
|
}
|
@@ -2008,10 +2133,6 @@ struct llama_context {
|
|
2008
2133
|
ggml_backend_free(backend);
|
2009
2134
|
}
|
2010
2135
|
|
2011
|
-
#ifdef GGML_USE_VULKAN
|
2012
|
-
ggml_vk_free_cpu_assist();
|
2013
|
-
#endif
|
2014
|
-
|
2015
2136
|
ggml_backend_buffer_free(buf_output);
|
2016
2137
|
}
|
2017
2138
|
|
@@ -2048,20 +2169,20 @@ struct llama_context {
|
|
2048
2169
|
// host buffer for the model output (logits and embeddings)
|
2049
2170
|
ggml_backend_buffer_t buf_output = nullptr;
|
2050
2171
|
|
2051
|
-
// decode output (2-dimensional array: [
|
2052
|
-
size_t
|
2053
|
-
float * logits
|
2172
|
+
// decode output (2-dimensional array: [n_outputs][n_vocab])
|
2173
|
+
size_t logits_size = 0; // capacity (of floats) for logits
|
2174
|
+
float * logits = nullptr;
|
2175
|
+
|
2176
|
+
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
|
2177
|
+
size_t output_size = 0; // capacity (of tokens positions) for the output buffers
|
2178
|
+
int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
|
2054
2179
|
|
2055
|
-
#ifndef NDEBUG
|
2056
|
-
// guard against access to unset logits
|
2057
|
-
std::vector<bool> logits_valid;
|
2058
|
-
#endif
|
2059
2180
|
bool logits_all = false;
|
2060
2181
|
|
2061
|
-
// embeddings output (2-dimensional array: [
|
2182
|
+
// embeddings output (2-dimensional array: [n_outputs][n_embd])
|
2062
2183
|
// populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
|
2063
|
-
size_t
|
2064
|
-
float * embd
|
2184
|
+
size_t embd_size = 0; // capacity (of floats) for embeddings
|
2185
|
+
float * embd = nullptr;
|
2065
2186
|
|
2066
2187
|
// sequence embeddings output (map of [n_embd] vectors)
|
2067
2188
|
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
|
@@ -2078,14 +2199,18 @@ struct llama_context {
|
|
2078
2199
|
struct ggml_tensor * inp_tokens; // I32 [n_batch]
|
2079
2200
|
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
|
2080
2201
|
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
2202
|
+
struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
|
2081
2203
|
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
|
2082
|
-
struct ggml_tensor * inp_KQ_pos; // F32 [
|
2204
|
+
struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
|
2083
2205
|
struct ggml_tensor * inp_K_shift; // I32 [kv_size]
|
2084
2206
|
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
|
2085
2207
|
struct ggml_tensor * inp_cls; // I32 [n_batch]
|
2086
2208
|
struct ggml_tensor * inp_s_copy; // I32 [kv_size]
|
2087
|
-
struct ggml_tensor * inp_s_mask; // F32 [1,
|
2088
|
-
struct ggml_tensor * inp_s_seq; // I32 [
|
2209
|
+
struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
|
2210
|
+
struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
|
2211
|
+
|
2212
|
+
// control vectors
|
2213
|
+
struct llama_control_vector cvec;
|
2089
2214
|
|
2090
2215
|
#ifdef GGML_USE_MPI
|
2091
2216
|
ggml_mpi_context * ctx_mpi = NULL;
|
@@ -2737,6 +2862,8 @@ namespace GGUFMeta {
|
|
2737
2862
|
};
|
2738
2863
|
}
|
2739
2864
|
|
2865
|
+
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
|
2866
|
+
|
2740
2867
|
struct llama_model_loader {
|
2741
2868
|
int n_kv = 0;
|
2742
2869
|
int n_tensors = 0;
|
@@ -2747,54 +2874,133 @@ struct llama_model_loader {
|
|
2747
2874
|
|
2748
2875
|
bool use_mmap = false;
|
2749
2876
|
|
2750
|
-
|
2877
|
+
llama_files files;
|
2751
2878
|
llama_ftype ftype;
|
2752
2879
|
llama_fver fver;
|
2753
2880
|
|
2754
|
-
|
2881
|
+
llama_mmaps mappings;
|
2882
|
+
|
2883
|
+
// Holds information on a model weight
|
2884
|
+
struct llama_tensor_weight {
|
2885
|
+
uint16_t idx; // source file index
|
2886
|
+
size_t offs; // tensor data offset in the original file
|
2887
|
+
|
2888
|
+
ggml_tensor * tensor;
|
2889
|
+
|
2890
|
+
llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
|
2891
|
+
const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
|
2892
|
+
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
|
2893
|
+
}
|
2894
|
+
};
|
2895
|
+
std::vector<llama_tensor_weight> weights;
|
2896
|
+
|
2755
2897
|
std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
|
2756
2898
|
|
2757
|
-
struct gguf_context *
|
2758
|
-
|
2899
|
+
struct gguf_context * meta = NULL;
|
2900
|
+
std::vector<ggml_context *> contexts;
|
2759
2901
|
|
2760
2902
|
std::string arch_name;
|
2761
2903
|
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
2762
2904
|
|
2763
|
-
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p)
|
2905
|
+
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
|
2764
2906
|
int trace = 0;
|
2765
2907
|
if (getenv("LLAMA_TRACE")) {
|
2766
2908
|
trace = atoi(getenv("LLAMA_TRACE"));
|
2767
2909
|
}
|
2768
2910
|
|
2769
|
-
struct gguf_init_params params = {
|
2770
|
-
/*.no_alloc = */ true,
|
2771
|
-
/*.ctx = */ &ctx_meta,
|
2772
|
-
};
|
2773
|
-
|
2774
2911
|
if (param_overrides_p != nullptr) {
|
2775
2912
|
for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
|
2776
2913
|
kv_overrides.insert({std::string(p->key), *p});
|
2777
2914
|
}
|
2778
2915
|
}
|
2779
2916
|
|
2780
|
-
|
2781
|
-
|
2917
|
+
struct ggml_context * ctx = NULL;
|
2918
|
+
struct gguf_init_params params = {
|
2919
|
+
/*.no_alloc = */ true,
|
2920
|
+
/*.ctx = */ &ctx,
|
2921
|
+
};
|
2922
|
+
|
2923
|
+
meta = gguf_init_from_file(fname.c_str(), params);
|
2924
|
+
if (!meta) {
|
2782
2925
|
throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
|
2783
2926
|
}
|
2784
2927
|
|
2785
2928
|
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
2786
2929
|
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
2787
2930
|
|
2788
|
-
|
2789
|
-
|
2931
|
+
// Save tensors data offset of the main file.
|
2932
|
+
// For subsidiary files, `meta` tensor data offset must not be used,
|
2933
|
+
// so we build a unified tensors index for weights.
|
2934
|
+
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
2935
|
+
weights.emplace_back(0, cur->name, meta, cur);
|
2936
|
+
}
|
2937
|
+
files.emplace_back(new llama_file(fname.c_str(), "rb"));
|
2938
|
+
contexts.emplace_back(ctx);
|
2939
|
+
|
2940
|
+
uint16_t n_split = 0;
|
2941
|
+
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
|
2942
|
+
|
2943
|
+
// Load additional GGML contexts
|
2944
|
+
if (n_split > 1) {
|
2945
|
+
uint16_t idx = 0;
|
2946
|
+
get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
|
2947
|
+
if (idx != 0) {
|
2948
|
+
throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
|
2949
|
+
}
|
2950
|
+
|
2951
|
+
char split_prefix[PATH_MAX] = {0};
|
2952
|
+
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
|
2953
|
+
throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
|
2954
|
+
}
|
2955
|
+
|
2956
|
+
if (trace > 0) {
|
2957
|
+
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
|
2958
|
+
}
|
2959
|
+
|
2960
|
+
char split_path[PATH_MAX] = {0};
|
2961
|
+
for (idx = 1; idx < n_split; idx++) {
|
2962
|
+
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
|
2963
|
+
|
2964
|
+
struct gguf_init_params split_params = {
|
2965
|
+
/*.no_alloc = */ true,
|
2966
|
+
/*.ctx = */ &ctx,
|
2967
|
+
};
|
2968
|
+
struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
|
2969
|
+
if (!ctx_gguf) {
|
2970
|
+
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
|
2971
|
+
}
|
2972
|
+
|
2973
|
+
// Save tensors data offset info of the shard.
|
2974
|
+
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
2975
|
+
weights.emplace_back(idx, cur->name, ctx_gguf, cur);
|
2976
|
+
}
|
2977
|
+
files.emplace_back(new llama_file(split_path, "rb"));
|
2978
|
+
contexts.emplace_back(ctx);
|
2979
|
+
|
2980
|
+
gguf_free(ctx_gguf);
|
2981
|
+
}
|
2982
|
+
|
2983
|
+
get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
|
2984
|
+
|
2985
|
+
// sanity check
|
2986
|
+
{
|
2987
|
+
const int n_tensors_loaded = (int) weights.size();
|
2988
|
+
if (n_tensors != n_tensors_loaded) {
|
2989
|
+
throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
|
2990
|
+
}
|
2991
|
+
}
|
2790
2992
|
|
2791
|
-
|
2993
|
+
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
|
2994
|
+
}
|
2995
|
+
|
2996
|
+
n_kv = gguf_get_n_kv(meta);
|
2997
|
+
n_tensors = weights.size();
|
2792
2998
|
|
2793
|
-
|
2794
|
-
|
2795
|
-
|
2796
|
-
n_elements += ggml_nelements(
|
2797
|
-
n_bytes += ggml_nbytes(
|
2999
|
+
fver = (enum llama_fver) gguf_get_version(meta);
|
3000
|
+
|
3001
|
+
for (auto & w : weights) {
|
3002
|
+
n_elements += ggml_nelements(w.tensor);
|
3003
|
+
n_bytes += ggml_nbytes(w.tensor);
|
2798
3004
|
}
|
2799
3005
|
|
2800
3006
|
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
@@ -2809,7 +3015,8 @@ struct llama_model_loader {
|
|
2809
3015
|
enum ggml_type type_max = GGML_TYPE_F32;
|
2810
3016
|
|
2811
3017
|
for (int i = 0; i < n_tensors; i++) {
|
2812
|
-
|
3018
|
+
const ggml_tensor * tensor = weights.at(i).tensor;
|
3019
|
+
enum ggml_type type = tensor->type;
|
2813
3020
|
|
2814
3021
|
n_type[type]++;
|
2815
3022
|
|
@@ -2819,8 +3026,8 @@ struct llama_model_loader {
|
|
2819
3026
|
}
|
2820
3027
|
|
2821
3028
|
if (trace > 0) {
|
2822
|
-
|
2823
|
-
LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(
|
3029
|
+
const uint16_t sid = weights.at(i).idx;
|
3030
|
+
LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
|
2824
3031
|
}
|
2825
3032
|
}
|
2826
3033
|
|
@@ -2842,6 +3049,7 @@ struct llama_model_loader {
|
|
2842
3049
|
case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
|
2843
3050
|
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
2844
3051
|
case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
|
3052
|
+
case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
|
2845
3053
|
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
2846
3054
|
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
2847
3055
|
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
@@ -2856,22 +3064,23 @@ struct llama_model_loader {
|
|
2856
3064
|
ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
|
2857
3065
|
|
2858
3066
|
{
|
2859
|
-
const int kid = gguf_find_key(
|
3067
|
+
const int kid = gguf_find_key(meta, "general.file_type");
|
2860
3068
|
if (kid >= 0) {
|
2861
|
-
ftype = (llama_ftype) gguf_get_val_u32(
|
3069
|
+
ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
|
2862
3070
|
}
|
2863
3071
|
}
|
2864
3072
|
|
2865
3073
|
LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
3074
|
+
|
2866
3075
|
for (int i = 0; i < n_kv; i++) {
|
2867
|
-
const char * name = gguf_get_key(
|
2868
|
-
const enum gguf_type type = gguf_get_kv_type(
|
3076
|
+
const char * name = gguf_get_key(meta, i);
|
3077
|
+
const enum gguf_type type = gguf_get_kv_type(meta, i);
|
2869
3078
|
const std::string type_name =
|
2870
3079
|
type == GGUF_TYPE_ARRAY
|
2871
|
-
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(
|
3080
|
+
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
|
2872
3081
|
: gguf_type_name(type);
|
2873
3082
|
|
2874
|
-
std::string value = gguf_kv_to_str(
|
3083
|
+
std::string value = gguf_kv_to_str(meta, i);
|
2875
3084
|
const size_t MAX_VALUE_LEN = 40;
|
2876
3085
|
if (value.size() > MAX_VALUE_LEN) {
|
2877
3086
|
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
|
@@ -2900,18 +3109,18 @@ struct llama_model_loader {
|
|
2900
3109
|
}
|
2901
3110
|
|
2902
3111
|
~llama_model_loader() {
|
2903
|
-
if (
|
2904
|
-
gguf_free(
|
3112
|
+
if (meta) {
|
3113
|
+
gguf_free(meta);
|
2905
3114
|
}
|
2906
|
-
|
2907
|
-
ggml_free(
|
3115
|
+
for (auto * ctx : contexts) {
|
3116
|
+
ggml_free(ctx);
|
2908
3117
|
}
|
2909
3118
|
}
|
2910
3119
|
|
2911
3120
|
template<typename T>
|
2912
3121
|
typename std::enable_if<std::is_integral<T>::value, bool>::type
|
2913
3122
|
get_arr_n(const std::string & key, T & result, const bool required = true) {
|
2914
|
-
const int kid = gguf_find_key(
|
3123
|
+
const int kid = gguf_find_key(meta, key.c_str());
|
2915
3124
|
|
2916
3125
|
if (kid < 0) {
|
2917
3126
|
if (required) {
|
@@ -2921,7 +3130,7 @@ struct llama_model_loader {
|
|
2921
3130
|
}
|
2922
3131
|
|
2923
3132
|
struct GGUFMeta::ArrayInfo arr_info =
|
2924
|
-
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(
|
3133
|
+
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
|
2925
3134
|
|
2926
3135
|
|
2927
3136
|
result = arr_info.length;
|
@@ -2941,7 +3150,7 @@ struct llama_model_loader {
|
|
2941
3150
|
const struct llama_model_kv_override * override =
|
2942
3151
|
it != kv_overrides.end() ? &it->second : nullptr;
|
2943
3152
|
|
2944
|
-
const bool found = GGUFMeta::GKV<T>::set(
|
3153
|
+
const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
|
2945
3154
|
|
2946
3155
|
if (required && !found) {
|
2947
3156
|
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
|
@@ -2964,28 +3173,57 @@ struct llama_model_loader {
|
|
2964
3173
|
}
|
2965
3174
|
|
2966
3175
|
const char * get_tensor_name(int i) const {
|
2967
|
-
return
|
3176
|
+
return weights.at(i).tensor->name;
|
3177
|
+
}
|
3178
|
+
|
3179
|
+
const llama_tensor_weight * get_weight(const char * name) const {
|
3180
|
+
for (const auto & weight : weights) {
|
3181
|
+
if (strcmp(name, weight.tensor->name) == 0) {
|
3182
|
+
return &weight;
|
3183
|
+
}
|
3184
|
+
}
|
3185
|
+
return nullptr;
|
3186
|
+
}
|
3187
|
+
|
3188
|
+
const llama_tensor_weight & require_weight(const char * name) const {
|
3189
|
+
const llama_tensor_weight * weight = get_weight(name);
|
3190
|
+
if (!weight) {
|
3191
|
+
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
|
3192
|
+
}
|
3193
|
+
return *weight;
|
2968
3194
|
}
|
2969
3195
|
|
2970
3196
|
struct ggml_tensor * get_tensor_meta(const char * name) const {
|
2971
|
-
|
3197
|
+
const auto * weight = get_weight(name);
|
3198
|
+
if (!weight) {
|
3199
|
+
return nullptr;
|
3200
|
+
}
|
3201
|
+
return weight->tensor;
|
3202
|
+
}
|
3203
|
+
|
3204
|
+
struct ggml_tensor * require_tensor_meta(const char * name) const {
|
3205
|
+
struct ggml_tensor * tensor = get_tensor_meta(name);
|
3206
|
+
if (!tensor) {
|
3207
|
+
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
|
3208
|
+
}
|
3209
|
+
return tensor;
|
2972
3210
|
}
|
2973
3211
|
|
2974
3212
|
struct ggml_tensor * get_tensor_meta(int i) const {
|
2975
3213
|
return get_tensor_meta(get_tensor_name(i));
|
2976
3214
|
}
|
2977
3215
|
|
2978
|
-
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor *
|
2979
|
-
struct ggml_tensor * tensor = ggml_dup_tensor(ctx,
|
2980
|
-
ggml_set_name(tensor, ggml_get_name(
|
3216
|
+
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
|
3217
|
+
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
|
3218
|
+
ggml_set_name(tensor, ggml_get_name(cur));
|
2981
3219
|
|
2982
3220
|
n_created++;
|
2983
3221
|
|
2984
3222
|
return tensor;
|
2985
3223
|
}
|
2986
3224
|
|
2987
|
-
struct ggml_tensor *
|
2988
|
-
struct ggml_tensor * cur =
|
3225
|
+
const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
|
3226
|
+
const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
|
2989
3227
|
|
2990
3228
|
if (cur == NULL) {
|
2991
3229
|
if (!required) {
|
@@ -2996,8 +3234,8 @@ struct llama_model_loader {
|
|
2996
3234
|
|
2997
3235
|
{
|
2998
3236
|
bool is_ok = true;
|
2999
|
-
for (size_t i = 0; i <
|
3000
|
-
if (ne[i] != cur->ne[i]) {
|
3237
|
+
for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
|
3238
|
+
if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
|
3001
3239
|
is_ok = false;
|
3002
3240
|
break;
|
3003
3241
|
}
|
@@ -3011,127 +3249,196 @@ struct llama_model_loader {
|
|
3011
3249
|
}
|
3012
3250
|
}
|
3013
3251
|
|
3014
|
-
return
|
3252
|
+
return cur;
|
3015
3253
|
}
|
3016
3254
|
|
3017
|
-
|
3018
|
-
|
3019
|
-
|
3255
|
+
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
|
3256
|
+
const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
|
3257
|
+
|
3258
|
+
if (cur == NULL) {
|
3259
|
+
return NULL;
|
3020
3260
|
}
|
3261
|
+
|
3262
|
+
return create_tensor_for(ctx, cur);
|
3021
3263
|
}
|
3022
3264
|
|
3023
|
-
|
3024
|
-
const
|
3265
|
+
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
|
3266
|
+
const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
|
3025
3267
|
|
3026
|
-
if (
|
3027
|
-
|
3268
|
+
if (cur == NULL) {
|
3269
|
+
return NULL;
|
3028
3270
|
}
|
3029
3271
|
|
3030
|
-
|
3031
|
-
|
3272
|
+
if (cur->type != base->type) {
|
3273
|
+
throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
|
3274
|
+
}
|
3032
3275
|
|
3033
|
-
|
3034
|
-
|
3035
|
-
|
3036
|
-
mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
|
3276
|
+
std::array<int64_t, GGML_MAX_DIMS> dims;
|
3277
|
+
for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
|
3278
|
+
dims[i] = i < ne.size() ? ne[i] : 1;
|
3037
3279
|
}
|
3038
3280
|
|
3039
|
-
|
3040
|
-
|
3041
|
-
|
3042
|
-
|
3281
|
+
struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
|
3282
|
+
dims[0], dims[1], dims[2], dims[3],
|
3283
|
+
cur->nb[1], cur->nb[2], cur->nb[3],
|
3284
|
+
offset);
|
3285
|
+
|
3286
|
+
ggml_set_name(tensor, name.c_str());
|
3287
|
+
|
3288
|
+
n_created++;
|
3289
|
+
|
3290
|
+
return tensor;
|
3291
|
+
}
|
3292
|
+
|
3293
|
+
void done_getting_tensors() const {
|
3294
|
+
if (n_created != n_tensors) {
|
3295
|
+
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
|
3043
3296
|
}
|
3297
|
+
}
|
3044
3298
|
|
3045
|
-
|
3046
|
-
|
3047
|
-
|
3299
|
+
void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
|
3300
|
+
if (use_mmap) {
|
3301
|
+
mappings.reserve(files.size());
|
3302
|
+
mmaps_used.reserve(files.size());
|
3303
|
+
for (const auto & file : files) {
|
3304
|
+
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
|
3305
|
+
mmaps_used.emplace_back(mapping->size, 0);
|
3306
|
+
if (mlock_mmaps) {
|
3307
|
+
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
|
3308
|
+
mlock_mmap->init(mapping->addr);
|
3309
|
+
mlock_mmaps->emplace_back(std::move(mlock_mmap));
|
3310
|
+
}
|
3311
|
+
mappings.emplace_back(std::move(mapping));
|
3048
3312
|
}
|
3049
|
-
|
3313
|
+
}
|
3314
|
+
|
3315
|
+
// compute the total size of all tensors for progress reporting
|
3316
|
+
for (auto & w : weights) {
|
3317
|
+
size_data += ggml_nbytes(w.tensor);
|
3050
3318
|
}
|
3051
3319
|
}
|
3052
3320
|
|
3053
|
-
void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
|
3054
|
-
GGML_ASSERT(
|
3321
|
+
void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
|
3322
|
+
GGML_ASSERT(!mappings.empty());
|
3323
|
+
const auto & mapping = mappings.at(idx);
|
3055
3324
|
|
3056
3325
|
*first = mapping->size;
|
3057
3326
|
*last = 0;
|
3327
|
+
*addr = mapping->addr;
|
3058
3328
|
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
|
3059
|
-
|
3060
|
-
|
3061
|
-
|
3329
|
+
try {
|
3330
|
+
const auto * weight = get_weight(ggml_get_name(tensor));
|
3331
|
+
if (!weight) {
|
3332
|
+
continue;
|
3333
|
+
}
|
3334
|
+
if (weight->idx != idx) {
|
3335
|
+
continue;
|
3336
|
+
}
|
3337
|
+
*first = std::min(*first, weight->offs);
|
3338
|
+
*last = std::max(*last, weight->offs + ggml_nbytes(tensor));
|
3339
|
+
} catch(...) {
|
3340
|
+
// the tensor is not in the model
|
3341
|
+
}
|
3062
3342
|
}
|
3063
3343
|
}
|
3064
3344
|
|
3065
3345
|
// for backwards compatibility, does not support ggml-backend
|
3066
3346
|
void load_data_for(struct ggml_tensor * cur) const {
|
3067
|
-
const
|
3347
|
+
const auto & w = require_weight(ggml_get_name(cur));
|
3068
3348
|
|
3069
|
-
if (use_mmap
|
3349
|
+
if (use_mmap) {
|
3350
|
+
const auto & mapping = mappings.at(w.idx);
|
3070
3351
|
if (cur->data == nullptr) {
|
3071
|
-
cur->data = (uint8_t *)mapping->addr + offs;
|
3352
|
+
cur->data = (uint8_t *)mapping->addr + w.offs;
|
3072
3353
|
} else {
|
3073
|
-
memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
|
3354
|
+
memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
|
3074
3355
|
}
|
3075
3356
|
} else {
|
3076
3357
|
GGML_ASSERT(cur->data != nullptr);
|
3077
|
-
|
3078
|
-
file.
|
3358
|
+
GGML_ASSERT(w.idx < files.size());
|
3359
|
+
const auto & file = files.at(w.idx);
|
3360
|
+
file->seek(w.offs, SEEK_SET);
|
3361
|
+
file->read_raw(cur->data, ggml_nbytes(cur));
|
3079
3362
|
}
|
3080
3363
|
}
|
3081
3364
|
|
3082
3365
|
size_t size_done = 0;
|
3083
3366
|
size_t size_data = 0;
|
3084
|
-
size_t
|
3085
|
-
size_t mmap_used_last = 0;
|
3367
|
+
std::vector<std::pair<size_t, size_t>> mmaps_used;
|
3086
3368
|
|
3087
3369
|
// Returns false if cancelled by progress_callback
|
3088
|
-
bool load_all_data(
|
3089
|
-
|
3370
|
+
bool load_all_data(
|
3371
|
+
struct ggml_context * ctx,
|
3372
|
+
llama_buf_map & bufs_mmap,
|
3373
|
+
llama_mlocks * lmlocks,
|
3374
|
+
llama_progress_callback progress_callback,
|
3375
|
+
void * progress_callback_user_data) {
|
3376
|
+
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
|
3090
3377
|
|
3091
3378
|
std::vector<no_init<uint8_t>> read_buf;
|
3092
|
-
|
3093
3379
|
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
3380
|
+
const auto * weight = get_weight(ggml_get_name(cur));
|
3381
|
+
if (weight == nullptr) {
|
3382
|
+
// this can happen with split experts models
|
3383
|
+
continue;
|
3384
|
+
}
|
3385
|
+
|
3094
3386
|
if (progress_callback) {
|
3095
3387
|
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
|
3096
3388
|
return false;
|
3097
3389
|
}
|
3098
3390
|
}
|
3099
3391
|
|
3100
|
-
|
3392
|
+
size_t n_size = ggml_nbytes(cur);
|
3101
3393
|
|
3102
|
-
if (use_mmap
|
3394
|
+
if (use_mmap) {
|
3395
|
+
const auto & mapping = mappings.at(weight->idx);
|
3396
|
+
ggml_backend_buffer_t buf_mmap = nullptr;
|
3397
|
+
if (bufs_mmap.count(weight->idx)) {
|
3398
|
+
buf_mmap = bufs_mmap.at(weight->idx);
|
3399
|
+
}
|
3400
|
+
GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
|
3103
3401
|
if (buf_mmap && cur->data == nullptr) {
|
3104
|
-
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
|
3105
|
-
if (
|
3106
|
-
lmlock
|
3402
|
+
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
|
3403
|
+
if (lmlocks) {
|
3404
|
+
const auto & lmlock = lmlocks->at(weight->idx);
|
3405
|
+
lmlock->grow_to(weight->offs + ggml_nbytes(cur));
|
3107
3406
|
}
|
3108
|
-
|
3109
|
-
|
3407
|
+
|
3408
|
+
auto & mmap_used = mmaps_used[weight->idx];
|
3409
|
+
mmap_used.first = std::min(mmap_used.first, weight->offs);
|
3410
|
+
mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
|
3110
3411
|
} else {
|
3111
|
-
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0,
|
3412
|
+
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
|
3112
3413
|
}
|
3113
3414
|
} else {
|
3415
|
+
GGML_ASSERT(weight->idx < files.size());
|
3416
|
+
const auto & file = files.at(weight->idx);
|
3114
3417
|
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
3115
|
-
file
|
3116
|
-
file
|
3418
|
+
file->seek(weight->offs, SEEK_SET);
|
3419
|
+
file->read_raw(cur->data, ggml_nbytes(cur));
|
3117
3420
|
} else {
|
3118
3421
|
read_buf.resize(ggml_nbytes(cur));
|
3119
|
-
file
|
3120
|
-
file
|
3121
|
-
ggml_backend_tensor_set(cur, read_buf.data(), 0,
|
3422
|
+
file->seek(weight->offs, SEEK_SET);
|
3423
|
+
file->read_raw(read_buf.data(), ggml_nbytes(cur));
|
3424
|
+
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
3122
3425
|
}
|
3123
3426
|
}
|
3124
3427
|
|
3125
|
-
size_done +=
|
3428
|
+
size_done += n_size;
|
3126
3429
|
}
|
3127
3430
|
|
3128
3431
|
// check if this is the last call and do final cleanup
|
3129
3432
|
if (size_done >= size_data) {
|
3130
3433
|
// unmap offloaded tensors and metadata
|
3131
|
-
if (use_mmap
|
3132
|
-
|
3133
|
-
|
3134
|
-
mapping
|
3434
|
+
if (use_mmap) {
|
3435
|
+
for (uint32_t idx = 0; idx < mappings.size(); idx++) {
|
3436
|
+
const auto & mmap_used = mmaps_used.at(idx);
|
3437
|
+
auto & mapping = mappings.at(idx);
|
3438
|
+
mapping->unmap_fragment(0, mmap_used.first);
|
3439
|
+
if (mmap_used.second != 0) {
|
3440
|
+
mapping->unmap_fragment(mmap_used.second, mapping->size);
|
3441
|
+
}
|
3135
3442
|
}
|
3136
3443
|
}
|
3137
3444
|
if (progress_callback) {
|
@@ -3204,6 +3511,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
3204
3511
|
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
|
3205
3512
|
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
|
3206
3513
|
case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
|
3514
|
+
case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
|
3207
3515
|
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
|
3208
3516
|
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
|
3209
3517
|
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
|
@@ -3231,9 +3539,11 @@ static const char * llama_model_type_name(e_model type) {
|
|
3231
3539
|
case MODEL_20B: return "20B";
|
3232
3540
|
case MODEL_30B: return "30B";
|
3233
3541
|
case MODEL_34B: return "34B";
|
3542
|
+
case MODEL_35B: return "35B";
|
3234
3543
|
case MODEL_40B: return "40B";
|
3235
3544
|
case MODEL_65B: return "65B";
|
3236
3545
|
case MODEL_70B: return "70B";
|
3546
|
+
case MODEL_314B: return "314B";
|
3237
3547
|
case MODEL_SMALL: return "0.1B";
|
3238
3548
|
case MODEL_MEDIUM: return "0.4B";
|
3239
3549
|
case MODEL_LARGE: return "0.8B";
|
@@ -3263,7 +3573,7 @@ static void llm_load_hparams(
|
|
3263
3573
|
llama_model_loader & ml,
|
3264
3574
|
llama_model & model) {
|
3265
3575
|
auto & hparams = model.hparams;
|
3266
|
-
const gguf_context * ctx = ml.
|
3576
|
+
const gguf_context * ctx = ml.meta;
|
3267
3577
|
|
3268
3578
|
// get metadata as string
|
3269
3579
|
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
|
@@ -3372,6 +3682,15 @@ static void llm_load_hparams(
|
|
3372
3682
|
default: model.type = e_model::MODEL_UNKNOWN;
|
3373
3683
|
}
|
3374
3684
|
} break;
|
3685
|
+
case LLM_ARCH_GROK:
|
3686
|
+
{
|
3687
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
3688
|
+
|
3689
|
+
switch (hparams.n_layer) {
|
3690
|
+
case 64: model.type = e_model::MODEL_314B; break;
|
3691
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
3692
|
+
}
|
3693
|
+
} break;
|
3375
3694
|
case LLM_ARCH_FALCON:
|
3376
3695
|
{
|
3377
3696
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
@@ -3623,6 +3942,25 @@ static void llm_load_hparams(
|
|
3623
3942
|
default: model.type = e_model::MODEL_UNKNOWN;
|
3624
3943
|
}
|
3625
3944
|
} break;
|
3945
|
+
case LLM_ARCH_XVERSE:
|
3946
|
+
{
|
3947
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
3948
|
+
switch (hparams.n_layer) {
|
3949
|
+
case 32: model.type = e_model::MODEL_7B; break;
|
3950
|
+
case 40: model.type = e_model::MODEL_13B; break;
|
3951
|
+
case 80: model.type = e_model::MODEL_65B; break;
|
3952
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
3953
|
+
}
|
3954
|
+
} break;
|
3955
|
+
case LLM_ARCH_COMMAND_R:
|
3956
|
+
{
|
3957
|
+
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
3958
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
3959
|
+
switch (hparams.n_layer) {
|
3960
|
+
case 40: model.type = e_model::MODEL_35B; break;
|
3961
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
3962
|
+
}
|
3963
|
+
} break;
|
3626
3964
|
default: (void)0;
|
3627
3965
|
}
|
3628
3966
|
|
@@ -3644,7 +3982,7 @@ static void llm_load_vocab(
|
|
3644
3982
|
llama_model & model) {
|
3645
3983
|
auto & vocab = model.vocab;
|
3646
3984
|
|
3647
|
-
struct gguf_context * ctx = ml.
|
3985
|
+
struct gguf_context * ctx = ml.meta;
|
3648
3986
|
|
3649
3987
|
const auto kv = LLM_KV(model.arch);
|
3650
3988
|
|
@@ -3777,7 +4115,7 @@ static void llm_load_vocab(
|
|
3777
4115
|
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
3778
4116
|
vocab.linefeed_id = vocab.special_pad_id;
|
3779
4117
|
} else {
|
3780
|
-
const std::vector<int> ids = llama_tokenize_internal(vocab, "\
|
4118
|
+
const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
|
3781
4119
|
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
|
3782
4120
|
vocab.linefeed_id = ids[0];
|
3783
4121
|
}
|
@@ -3944,6 +4282,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
3944
4282
|
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
|
3945
4283
|
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
|
3946
4284
|
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
|
4285
|
+
LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
|
3947
4286
|
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
3948
4287
|
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
3949
4288
|
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
@@ -4009,6 +4348,7 @@ static bool llm_load_tensors(
|
|
4009
4348
|
|
4010
4349
|
const int64_t n_layer = hparams.n_layer;
|
4011
4350
|
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
|
4351
|
+
bool use_mmap_buffer = true;
|
4012
4352
|
|
4013
4353
|
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
4014
4354
|
model.buft_input = llama_default_buffer_type_cpu(true);
|
@@ -4097,6 +4437,10 @@ static bool llm_load_tensors(
|
|
4097
4437
|
|
4098
4438
|
// create one context per buffer type
|
4099
4439
|
size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
4440
|
+
|
4441
|
+
// for moe merged tensors
|
4442
|
+
ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
|
4443
|
+
|
4100
4444
|
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
4101
4445
|
for (auto & it : buft_layer_count) {
|
4102
4446
|
struct ggml_init_params params = {
|
@@ -4123,6 +4467,11 @@ static bool llm_load_tensors(
|
|
4123
4467
|
const int64_t n_vocab = hparams.n_vocab;
|
4124
4468
|
const int64_t n_vocab_type = hparams.n_vocab_type;
|
4125
4469
|
const int64_t n_ff = hparams.n_ff;
|
4470
|
+
const int64_t n_expert = hparams.n_expert;
|
4471
|
+
|
4472
|
+
if (n_expert > 0 && hparams.n_expert_used == 0) {
|
4473
|
+
throw std::runtime_error("model has expert layers but no expert layers are used");
|
4474
|
+
}
|
4126
4475
|
|
4127
4476
|
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
4128
4477
|
|
@@ -4177,26 +4526,113 @@ static bool llm_load_tensors(
|
|
4177
4526
|
|
4178
4527
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4179
4528
|
|
4180
|
-
|
4181
|
-
|
4182
|
-
if (layer.ffn_gate_inp == nullptr) {
|
4183
|
-
GGML_ASSERT(hparams.n_expert == 0);
|
4184
|
-
GGML_ASSERT(hparams.n_expert_used == 0);
|
4185
|
-
|
4529
|
+
if (n_expert == 0) {
|
4186
4530
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
4187
4531
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
4188
4532
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4189
4533
|
} else {
|
4190
|
-
|
4191
|
-
|
4192
|
-
|
4193
|
-
|
4194
|
-
|
4195
|
-
layer.
|
4196
|
-
|
4197
|
-
|
4534
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
4535
|
+
|
4536
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
4537
|
+
if (layer.ffn_gate_exps) {
|
4538
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
4539
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
4540
|
+
} else {
|
4541
|
+
// merge split expert into a single tensor for compatibility with older models
|
4542
|
+
// requires disabling mmap
|
4543
|
+
use_mmap_buffer = false;
|
4544
|
+
|
4545
|
+
ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
|
4546
|
+
ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
|
4547
|
+
ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
|
4548
|
+
|
4549
|
+
layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
|
4550
|
+
layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
|
4551
|
+
layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
|
4552
|
+
|
4553
|
+
ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
|
4554
|
+
ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
|
4555
|
+
ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
|
4556
|
+
|
4557
|
+
for (uint32_t x = 0; x < n_expert; ++x) {
|
4558
|
+
// the individual experts are loaded into a view of the merged tensor
|
4559
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
|
4560
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
|
4561
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
|
4562
|
+
}
|
4563
|
+
}
|
4564
|
+
}
|
4565
|
+
}
|
4566
|
+
} break;
|
4567
|
+
case LLM_ARCH_GROK:
|
4568
|
+
{
|
4569
|
+
if (n_expert == 0) {
|
4570
|
+
throw std::runtime_error("Grok model cannot have zero experts");
|
4571
|
+
}
|
4572
|
+
|
4573
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4574
|
+
|
4575
|
+
// output
|
4576
|
+
{
|
4577
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4578
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
4579
|
+
// if output is NULL, init from the input tok embed
|
4580
|
+
if (model.output == NULL) {
|
4581
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4582
|
+
ml.n_created--; // artificial tensor
|
4583
|
+
ml.size_data += ggml_nbytes(model.output);
|
4584
|
+
}
|
4585
|
+
}
|
4586
|
+
|
4587
|
+
for (int i = 0; i < n_layer; ++i) {
|
4588
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4589
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4590
|
+
|
4591
|
+
auto & layer = model.layers[i];
|
4592
|
+
|
4593
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4594
|
+
|
4595
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
4596
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
4597
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
4598
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4599
|
+
|
4600
|
+
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
|
4601
|
+
|
4602
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4603
|
+
|
4604
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
4605
|
+
|
4606
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
4607
|
+
if (layer.ffn_gate_exps) {
|
4608
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
4609
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
4610
|
+
} else {
|
4611
|
+
// merge split expert into a single tensor for compatibility with older models
|
4612
|
+
// requires disabling mmap
|
4613
|
+
use_mmap_buffer = false;
|
4614
|
+
|
4615
|
+
ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
|
4616
|
+
ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
|
4617
|
+
ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
|
4618
|
+
|
4619
|
+
layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
|
4620
|
+
layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
|
4621
|
+
layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
|
4622
|
+
|
4623
|
+
ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
|
4624
|
+
ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
|
4625
|
+
ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
|
4626
|
+
|
4627
|
+
for (uint32_t x = 0; x < n_expert; ++x) {
|
4628
|
+
// the individual experts are loaded into a view of the merged tensor
|
4629
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
|
4630
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
|
4631
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
|
4198
4632
|
}
|
4199
4633
|
}
|
4634
|
+
|
4635
|
+
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
4200
4636
|
}
|
4201
4637
|
} break;
|
4202
4638
|
case LLM_ARCH_BAICHUAN:
|
@@ -4235,9 +4671,9 @@ static bool llm_load_tensors(
|
|
4235
4671
|
{
|
4236
4672
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4237
4673
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
4238
|
-
|
4239
|
-
|
4240
|
-
|
4674
|
+
|
4675
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
4676
|
+
if (!model.output) {
|
4241
4677
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
4242
4678
|
ml.n_created--; // artificial tensor
|
4243
4679
|
ml.size_data += ggml_nbytes(model.output);
|
@@ -4253,10 +4689,8 @@ static bool llm_load_tensors(
|
|
4253
4689
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4254
4690
|
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
4255
4691
|
|
4256
|
-
|
4257
|
-
|
4258
|
-
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
|
4259
|
-
}
|
4692
|
+
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
|
4693
|
+
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
|
4260
4694
|
|
4261
4695
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
4262
4696
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
@@ -4436,16 +4870,19 @@ static bool llm_load_tensors(
|
|
4436
4870
|
case LLM_ARCH_MPT:
|
4437
4871
|
{
|
4438
4872
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4873
|
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
|
4439
4874
|
|
4440
4875
|
// output
|
4441
4876
|
{
|
4442
4877
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4443
4878
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
|
4444
4879
|
|
4445
|
-
|
4446
|
-
model.output
|
4447
|
-
|
4448
|
-
|
4880
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
4881
|
+
if (!model.output) {
|
4882
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
4883
|
+
ml.n_created--; // artificial tensor
|
4884
|
+
ml.size_data += ggml_nbytes(model.output);
|
4885
|
+
}
|
4449
4886
|
}
|
4450
4887
|
|
4451
4888
|
for (int i = 0; i < n_layer; ++i) {
|
@@ -4472,6 +4909,12 @@ static bool llm_load_tensors(
|
|
4472
4909
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4473
4910
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
|
4474
4911
|
|
4912
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
|
4913
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
|
4914
|
+
|
4915
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
|
4916
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
|
4917
|
+
|
4475
4918
|
// AWQ ScaleActivation layer
|
4476
4919
|
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
|
4477
4920
|
}
|
@@ -4918,6 +5361,59 @@ static bool llm_load_tensors(
|
|
4918
5361
|
layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
|
4919
5362
|
}
|
4920
5363
|
} break;
|
5364
|
+
case LLM_ARCH_XVERSE:
|
5365
|
+
{
|
5366
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5367
|
+
{
|
5368
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5369
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
5370
|
+
}
|
5371
|
+
for (int i = 0; i < n_layer; ++i) {
|
5372
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
5373
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5374
|
+
auto & layer = model.layers[i];
|
5375
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5376
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5377
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5378
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5379
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5380
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
5381
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5382
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
5383
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5384
|
+
}
|
5385
|
+
} break;
|
5386
|
+
case LLM_ARCH_COMMAND_R:
|
5387
|
+
{
|
5388
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5389
|
+
|
5390
|
+
// output
|
5391
|
+
{
|
5392
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5393
|
+
// init output from the input tok embed
|
5394
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5395
|
+
ml.n_created--; // artificial tensor
|
5396
|
+
ml.size_data += ggml_nbytes(model.output);
|
5397
|
+
}
|
5398
|
+
|
5399
|
+
for (int i = 0; i < n_layer; ++i) {
|
5400
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
5401
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5402
|
+
|
5403
|
+
auto & layer = model.layers[i];
|
5404
|
+
|
5405
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5406
|
+
|
5407
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5408
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5409
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5410
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5411
|
+
|
5412
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5413
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
5414
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5415
|
+
}
|
5416
|
+
} break;
|
4921
5417
|
default:
|
4922
5418
|
throw std::runtime_error("unknown architecture");
|
4923
5419
|
}
|
@@ -4925,49 +5421,97 @@ static bool llm_load_tensors(
|
|
4925
5421
|
|
4926
5422
|
ml.done_getting_tensors();
|
4927
5423
|
|
4928
|
-
ml.
|
5424
|
+
ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
|
5425
|
+
model.mappings.reserve(ml.mappings.size());
|
4929
5426
|
|
4930
5427
|
// create the backend buffers
|
4931
|
-
std::vector<std::pair<ggml_context *,
|
5428
|
+
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
|
5429
|
+
ctx_bufs.reserve(ctx_map.size());
|
5430
|
+
|
5431
|
+
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
|
5432
|
+
size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
|
5433
|
+
model.bufs.reserve(n_max_backend_buffer);
|
4932
5434
|
|
4933
5435
|
for (auto & it : ctx_map) {
|
4934
5436
|
ggml_backend_buffer_type_t buft = it.first;
|
4935
|
-
ggml_context * ctx
|
4936
|
-
|
5437
|
+
ggml_context * ctx = it.second;
|
5438
|
+
|
5439
|
+
llama_buf_map bufs;
|
5440
|
+
bufs.reserve(n_max_backend_buffer);
|
4937
5441
|
|
4938
5442
|
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
4939
5443
|
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
4940
5444
|
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
4941
|
-
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
|
4942
|
-
|
4943
|
-
|
4944
|
-
|
5445
|
+
if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
|
5446
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5447
|
+
void * addr = nullptr;
|
5448
|
+
size_t first, last;
|
5449
|
+
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
5450
|
+
if (first >= last) {
|
5451
|
+
continue;
|
5452
|
+
}
|
5453
|
+
ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
|
5454
|
+
if (buf == nullptr) {
|
5455
|
+
throw std::runtime_error("unable to allocate backend CPU buffer");
|
5456
|
+
}
|
5457
|
+
model.bufs.push_back(buf);
|
5458
|
+
bufs.emplace(idx, buf);
|
5459
|
+
#ifdef GGML_USE_CUDA
|
5460
|
+
if (n_layer >= n_gpu_layers) {
|
5461
|
+
ggml_backend_cuda_register_host_buffer(
|
5462
|
+
ggml_backend_buffer_get_base(buf),
|
5463
|
+
ggml_backend_buffer_get_size(buf));
|
5464
|
+
}
|
5465
|
+
#endif
|
5466
|
+
}
|
4945
5467
|
}
|
4946
5468
|
#ifdef GGML_USE_METAL
|
4947
|
-
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
|
4948
|
-
|
4949
|
-
|
4950
|
-
|
4951
|
-
|
5469
|
+
else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
|
5470
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5471
|
+
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
5472
|
+
void * addr = nullptr;
|
5473
|
+
size_t first, last;
|
5474
|
+
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
5475
|
+
if (first >= last) {
|
5476
|
+
continue;
|
5477
|
+
}
|
5478
|
+
ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
|
5479
|
+
if (buf == nullptr) {
|
5480
|
+
throw std::runtime_error("unable to allocate backend metal buffer");
|
5481
|
+
}
|
5482
|
+
model.bufs.push_back(buf);
|
5483
|
+
bufs.emplace(idx, buf);
|
5484
|
+
}
|
4952
5485
|
}
|
4953
5486
|
#endif
|
4954
5487
|
else {
|
4955
|
-
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
4956
|
-
if (buf
|
5488
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
5489
|
+
if (buf == nullptr) {
|
5490
|
+
throw std::runtime_error("unable to allocate backend buffer");
|
5491
|
+
}
|
5492
|
+
model.bufs.push_back(buf);
|
5493
|
+
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
|
4957
5494
|
model.mlock_bufs.emplace_back(new llama_mlock);
|
4958
5495
|
auto & mlock_buf = model.mlock_bufs.back();
|
4959
5496
|
mlock_buf->init (ggml_backend_buffer_get_base(buf));
|
4960
5497
|
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
|
4961
5498
|
}
|
5499
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5500
|
+
bufs.emplace(idx, buf);
|
5501
|
+
}
|
4962
5502
|
}
|
4963
|
-
|
5503
|
+
|
5504
|
+
if (bufs.empty()) {
|
4964
5505
|
throw std::runtime_error("failed to allocate buffer");
|
4965
5506
|
}
|
4966
|
-
|
4967
|
-
|
4968
|
-
|
4969
|
-
|
4970
|
-
|
5507
|
+
|
5508
|
+
for (auto & buf : bufs) {
|
5509
|
+
// indicate that this buffer contains weights
|
5510
|
+
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
|
5511
|
+
ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
5512
|
+
}
|
5513
|
+
|
5514
|
+
ctx_bufs.emplace_back(ctx, bufs);
|
4971
5515
|
}
|
4972
5516
|
|
4973
5517
|
if (llama_supports_gpu_offload()) {
|
@@ -4999,13 +5543,17 @@ static bool llm_load_tensors(
|
|
4999
5543
|
// load tensor data
|
5000
5544
|
for (auto & it : ctx_bufs) {
|
5001
5545
|
ggml_context * ctx = it.first;
|
5002
|
-
|
5003
|
-
if (!ml.load_all_data(ctx,
|
5546
|
+
auto & bufs = it.second;
|
5547
|
+
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
5004
5548
|
return false;
|
5005
5549
|
}
|
5006
5550
|
}
|
5007
5551
|
|
5008
|
-
|
5552
|
+
if (use_mmap_buffer) {
|
5553
|
+
for (auto & mapping : ml.mappings) {
|
5554
|
+
model.mappings.emplace_back(std::move(mapping));
|
5555
|
+
}
|
5556
|
+
}
|
5009
5557
|
|
5010
5558
|
// loading time will be recalculated after the first eval, so
|
5011
5559
|
// we take page faults deferred by mmap() into consideration
|
@@ -5064,6 +5612,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
|
5064
5612
|
}
|
5065
5613
|
#endif
|
5066
5614
|
|
5615
|
+
#ifdef GGML_USE_SYCL
|
5616
|
+
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
|
5617
|
+
ggml_backend_sycl_set_single_device_mode(params.main_gpu);
|
5618
|
+
// SYCL uses the device index (0, 1, 2) directly: the user passes a device id, which is then converted to a device index.
|
5619
|
+
params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
|
5620
|
+
} else {
|
5621
|
+
ggml_backend_sycl_set_mul_device_mode();
|
5622
|
+
}
|
5623
|
+
#endif
|
5624
|
+
|
5067
5625
|
if (!llm_load_tensors(
|
5068
5626
|
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
|
5069
5627
|
params.progress_callback, params.progress_callback_user_data
|
@@ -5150,8 +5708,8 @@ static void llm_build_kv_store(
|
|
5150
5708
|
GGML_ASSERT(kv.size == n_ctx);
|
5151
5709
|
|
5152
5710
|
// compute the transposed [n_tokens, n_embd] V matrix
|
5153
|
-
|
5154
|
-
|
5711
|
+
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
|
5712
|
+
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
|
5155
5713
|
cb(v_cur_t, "v_cur_t", il);
|
5156
5714
|
|
5157
5715
|
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
|
@@ -5335,6 +5893,20 @@ static struct ggml_tensor * llm_build_kqv(
|
|
5335
5893
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
5336
5894
|
}
|
5337
5895
|
|
5896
|
+
if (model.arch == LLM_ARCH_GROK) {
|
5897
|
+
// need to do the following:
|
5898
|
+
// multiply by attn_output_multiplier of 0.08838834764831845
|
5899
|
+
// and then :
|
5900
|
+
// kq = 30 * tanh(kq / 30)
|
5901
|
+
// before the softmax below
|
5902
|
+
|
5903
|
+
//try from phi2
|
5904
|
+
//ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
5905
|
+
|
5906
|
+
kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
|
5907
|
+
kq = ggml_scale(ctx, kq, 30);
|
5908
|
+
}
|
5909
|
+
|
5338
5910
|
#if defined(GGML_USE_KOMPUTE)
|
5339
5911
|
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
|
5340
5912
|
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
@@ -5461,7 +6033,8 @@ struct llm_build_context {
|
|
5461
6033
|
const float norm_rms_eps;
|
5462
6034
|
|
5463
6035
|
const int32_t n_tokens;
|
5464
|
-
const int32_t n_kv; // size of KV cache to consider (n_kv <=
|
6036
|
+
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
|
6037
|
+
const int32_t n_outputs;
|
5465
6038
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
5466
6039
|
const int32_t n_orig_ctx;
|
5467
6040
|
|
@@ -5508,6 +6081,7 @@ struct llm_build_context {
|
|
5508
6081
|
norm_rms_eps (hparams.f_norm_rms_eps),
|
5509
6082
|
n_tokens (batch.n_tokens),
|
5510
6083
|
n_kv (worst_case ? kv_self.size : kv_self.n),
|
6084
|
+
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
|
5511
6085
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
5512
6086
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
5513
6087
|
pooling_type (cparams.pooling_type),
|
@@ -5529,6 +6103,7 @@ struct llm_build_context {
|
|
5529
6103
|
lctx.inp_tokens = nullptr;
|
5530
6104
|
lctx.inp_embd = nullptr;
|
5531
6105
|
lctx.inp_pos = nullptr;
|
6106
|
+
lctx.inp_out_ids = nullptr;
|
5532
6107
|
lctx.inp_KQ_mask = nullptr;
|
5533
6108
|
lctx.inp_KQ_pos = nullptr;
|
5534
6109
|
lctx.inp_K_shift = nullptr;
|
@@ -5652,6 +6227,13 @@ struct llm_build_context {
|
|
5652
6227
|
return lctx.inp_pos;
|
5653
6228
|
}
|
5654
6229
|
|
6230
|
+
// Creates the 1-D I32 graph input with n_outputs elements, stores it in
// lctx.inp_out_ids and returns it. The graph builders pass it to ggml_get_rows
// to keep only the rows whose output is needed.
struct ggml_tensor * build_inp_out_ids() {
    struct ggml_tensor * out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);

    // publish on the context before the callback runs, as the callback may look at lctx
    lctx.inp_out_ids = out_ids;

    cb(out_ids, "inp_out_ids", -1);
    ggml_set_input(out_ids);

    return out_ids;
}
|
6236
|
+
|
5655
6237
|
struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
|
5656
6238
|
if (causal) {
|
5657
6239
|
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
|
@@ -5708,6 +6290,9 @@ struct llm_build_context {
|
|
5708
6290
|
struct ggml_cgraph * build_llama() {
|
5709
6291
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5710
6292
|
|
6293
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
6294
|
+
int32_t n_tokens = this->n_tokens;
|
6295
|
+
|
5711
6296
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5712
6297
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
5713
6298
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
@@ -5775,6 +6360,14 @@ struct llm_build_context {
|
|
5775
6360
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5776
6361
|
}
|
5777
6362
|
|
6363
|
+
if (il == n_layer - 1) {
|
6364
|
+
// skip computing output for unused tokens
|
6365
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6366
|
+
n_tokens = n_outputs;
|
6367
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6368
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6369
|
+
}
|
6370
|
+
|
5778
6371
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
5779
6372
|
cb(ffn_inp, "ffn_inp", il);
|
5780
6373
|
|
@@ -5827,19 +6420,19 @@ struct llm_build_context {
|
|
5827
6420
|
for (int i = 0; i < n_expert_used; ++i) {
|
5828
6421
|
ggml_tensor * cur_expert;
|
5829
6422
|
|
5830
|
-
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].
|
6423
|
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
5831
6424
|
cb(cur_up, "ffn_moe_up", il);
|
5832
6425
|
|
5833
|
-
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].
|
6426
|
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
5834
6427
|
cb(cur_gate, "ffn_moe_gate", il);
|
5835
6428
|
|
5836
6429
|
cur_gate = ggml_silu(ctx0, cur_gate);
|
5837
6430
|
cb(cur_gate, "ffn_moe_silu", il);
|
5838
6431
|
|
5839
|
-
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6432
|
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
5840
6433
|
cb(cur_expert, "ffn_moe_gate_par", il);
|
5841
6434
|
|
5842
|
-
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].
|
6435
|
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
5843
6436
|
cb(cur_expert, "ffn_moe_down", il);
|
5844
6437
|
|
5845
6438
|
cur_expert = ggml_mul(ctx0, cur_expert,
|
@@ -5858,6 +6451,12 @@ struct llm_build_context {
|
|
5858
6451
|
}
|
5859
6452
|
|
5860
6453
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
6454
|
+
cb(cur, "ffn_out", il);
|
6455
|
+
|
6456
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
6457
|
+
if (layer_dir != nullptr) {
|
6458
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
6459
|
+
}
|
5861
6460
|
cb(cur, "l_out", il);
|
5862
6461
|
|
5863
6462
|
// input for next layer
|
@@ -5893,7 +6492,7 @@ struct llm_build_context {
|
|
5893
6492
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
5894
6493
|
|
5895
6494
|
// inp_pos - contains the positions
|
5896
|
-
struct ggml_tensor * inp_pos = build_inp_pos();
|
6495
|
+
struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
|
5897
6496
|
|
5898
6497
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5899
6498
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
@@ -5943,12 +6542,18 @@ struct llm_build_context {
|
|
5943
6542
|
cb(Qcur, "Qcur", il);
|
5944
6543
|
cb(Kcur, "Kcur", il);
|
5945
6544
|
|
5946
|
-
|
5947
6545
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5948
6546
|
model.layers[il].wo, NULL,
|
5949
6547
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5950
6548
|
}
|
5951
6549
|
|
6550
|
+
if (il == n_layer - 1) {
|
6551
|
+
// skip computing output for unused tokens
|
6552
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6553
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6554
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6555
|
+
}
|
6556
|
+
|
5952
6557
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
5953
6558
|
cb(ffn_inp, "ffn_inp", il);
|
5954
6559
|
|
@@ -5991,6 +6596,111 @@ struct llm_build_context {
|
|
5991
6596
|
return gf;
|
5992
6597
|
}
|
5993
6598
|
|
6599
|
+
struct ggml_cgraph * build_xverse() {
|
6600
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6601
|
+
|
6602
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6603
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6604
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6605
|
+
|
6606
|
+
struct ggml_tensor * cur;
|
6607
|
+
struct ggml_tensor * inpL;
|
6608
|
+
|
6609
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6610
|
+
|
6611
|
+
// inp_pos - contains the positions
|
6612
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6613
|
+
|
6614
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6615
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6616
|
+
|
6617
|
+
// positions of the tokens in the KV cache
|
6618
|
+
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
6619
|
+
|
6620
|
+
for (int il = 0; il < n_layer; ++il) {
|
6621
|
+
struct ggml_tensor * inpSA = inpL;
|
6622
|
+
|
6623
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6624
|
+
model.layers[il].attn_norm, NULL,
|
6625
|
+
LLM_NORM_RMS, cb, il);
|
6626
|
+
cb(cur, "attn_norm", il);
|
6627
|
+
|
6628
|
+
// self-attention
|
6629
|
+
{
|
6630
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6631
|
+
cb(Qcur, "Qcur", il);
|
6632
|
+
|
6633
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6634
|
+
cb(Kcur, "Kcur", il);
|
6635
|
+
|
6636
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6637
|
+
cb(Vcur, "Vcur", il);
|
6638
|
+
|
6639
|
+
Qcur = ggml_rope_custom(
|
6640
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6641
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6642
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6643
|
+
);
|
6644
|
+
cb(Qcur, "Qcur", il);
|
6645
|
+
|
6646
|
+
Kcur = ggml_rope_custom(
|
6647
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6648
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6649
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6650
|
+
);
|
6651
|
+
cb(Kcur, "Kcur", il);
|
6652
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6653
|
+
model.layers[il].wo, NULL,
|
6654
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6655
|
+
}
|
6656
|
+
|
6657
|
+
if (il == n_layer - 1) {
|
6658
|
+
// skip computing output for unused tokens
|
6659
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6660
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6661
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6662
|
+
}
|
6663
|
+
|
6664
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6665
|
+
cb(ffn_inp, "ffn_inp", il);
|
6666
|
+
|
6667
|
+
// feed-forward network
|
6668
|
+
{
|
6669
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6670
|
+
model.layers[il].ffn_norm, NULL,
|
6671
|
+
LLM_NORM_RMS, cb, il);
|
6672
|
+
cb(cur, "ffn_norm", il);
|
6673
|
+
|
6674
|
+
cur = llm_build_ffn(ctx0, cur,
|
6675
|
+
model.layers[il].ffn_up, NULL,
|
6676
|
+
model.layers[il].ffn_gate, NULL,
|
6677
|
+
model.layers[il].ffn_down, NULL,
|
6678
|
+
NULL,
|
6679
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
6680
|
+
cb(cur, "ffn_out", il);
|
6681
|
+
}
|
6682
|
+
|
6683
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6684
|
+
cb(cur, "l_out", il);
|
6685
|
+
|
6686
|
+
// input for next layer
|
6687
|
+
inpL = cur;
|
6688
|
+
}
|
6689
|
+
|
6690
|
+
cur = inpL;
|
6691
|
+
|
6692
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
|
6693
|
+
cb(cur, "result_norm", -1);
|
6694
|
+
|
6695
|
+
// lm_head
|
6696
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
6697
|
+
cb(cur, "result_output", -1);
|
6698
|
+
|
6699
|
+
ggml_build_forward_expand(gf, cur);
|
6700
|
+
|
6701
|
+
return gf;
|
6702
|
+
}
|
6703
|
+
|
5994
6704
|
struct ggml_cgraph * build_falcon() {
|
5995
6705
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5996
6706
|
|
@@ -6064,6 +6774,14 @@ struct llm_build_context {
|
|
6064
6774
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6065
6775
|
}
|
6066
6776
|
|
6777
|
+
if (il == n_layer - 1) {
|
6778
|
+
// skip computing output for unused tokens
|
6779
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6780
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6781
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
6782
|
+
attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
|
6783
|
+
}
|
6784
|
+
|
6067
6785
|
struct ggml_tensor * ffn_inp = cur;
|
6068
6786
|
|
6069
6787
|
// feed forward
|
@@ -6104,6 +6822,214 @@ struct llm_build_context {
|
|
6104
6822
|
return gf;
|
6105
6823
|
}
|
6106
6824
|
|
6825
|
+
struct ggml_cgraph * build_grok() {
|
6826
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6827
|
+
|
6828
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
6829
|
+
int32_t n_tokens = this->n_tokens;
|
6830
|
+
|
6831
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6832
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6833
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6834
|
+
|
6835
|
+
struct ggml_tensor * cur;
|
6836
|
+
struct ggml_tensor * inpL;
|
6837
|
+
|
6838
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6839
|
+
|
6840
|
+
// multiply by embedding_multiplier_scale of 78.38367176906169
|
6841
|
+
inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
|
6842
|
+
|
6843
|
+
// inp_pos - contains the positions
|
6844
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6845
|
+
|
6846
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6847
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6848
|
+
|
6849
|
+
for (int il = 0; il < n_layer; ++il) {
|
6850
|
+
struct ggml_tensor * inpSA = inpL;
|
6851
|
+
|
6852
|
+
// norm
|
6853
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6854
|
+
model.layers[il].attn_norm, NULL,
|
6855
|
+
LLM_NORM_RMS, cb, il);
|
6856
|
+
cb(cur, "attn_norm", il);
|
6857
|
+
|
6858
|
+
|
6859
|
+
// self-attention
|
6860
|
+
{
|
6861
|
+
// compute Q and K and RoPE them
|
6862
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6863
|
+
cb(Qcur, "Qcur", il);
|
6864
|
+
if (model.layers[il].bq) {
|
6865
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
6866
|
+
cb(Qcur, "Qcur", il);
|
6867
|
+
}
|
6868
|
+
|
6869
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6870
|
+
cb(Kcur, "Kcur", il);
|
6871
|
+
if (model.layers[il].bk) {
|
6872
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
6873
|
+
cb(Kcur, "Kcur", il);
|
6874
|
+
}
|
6875
|
+
|
6876
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6877
|
+
cb(Vcur, "Vcur", il);
|
6878
|
+
if (model.layers[il].bv) {
|
6879
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
6880
|
+
cb(Vcur, "Vcur", il);
|
6881
|
+
}
|
6882
|
+
|
6883
|
+
Qcur = ggml_rope_custom(
|
6884
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6885
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6886
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6887
|
+
);
|
6888
|
+
cb(Qcur, "Qcur", il);
|
6889
|
+
|
6890
|
+
Kcur = ggml_rope_custom(
|
6891
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6892
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6893
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6894
|
+
);
|
6895
|
+
cb(Kcur, "Kcur", il);
|
6896
|
+
|
6897
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6898
|
+
model.layers[il].wo, model.layers[il].bo,
|
6899
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
6900
|
+
}
|
6901
|
+
|
6902
|
+
if (il == n_layer - 1) {
|
6903
|
+
// skip computing output for unused tokens
|
6904
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6905
|
+
n_tokens = n_outputs;
|
6906
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6907
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6908
|
+
}
|
6909
|
+
|
6910
|
+
// Grok
|
6911
|
+
// if attn_out_norm is present then apply it before adding the input
|
6912
|
+
if (model.layers[il].attn_out_norm) {
|
6913
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6914
|
+
model.layers[il].attn_out_norm, NULL,
|
6915
|
+
LLM_NORM_RMS, cb, il);
|
6916
|
+
cb(cur, "attn_out_norm", il);
|
6917
|
+
}
|
6918
|
+
|
6919
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6920
|
+
cb(ffn_inp, "ffn_inp", il);
|
6921
|
+
|
6922
|
+
// feed-forward network
|
6923
|
+
// MoE branch
|
6924
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6925
|
+
model.layers[il].ffn_norm, NULL,
|
6926
|
+
LLM_NORM_RMS, cb, il);
|
6927
|
+
cb(cur, "ffn_norm", il);
|
6928
|
+
|
6929
|
+
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
6930
|
+
cb(logits, "ffn_moe_logits", il);
|
6931
|
+
|
6932
|
+
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
6933
|
+
cb(probs, "ffn_moe_probs", il);
|
6934
|
+
|
6935
|
+
// select experts
|
6936
|
+
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
6937
|
+
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
6938
|
+
|
6939
|
+
ggml_tensor * weights = ggml_get_rows(ctx0,
|
6940
|
+
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
6941
|
+
cb(weights, "ffn_moe_weights", il);
|
6942
|
+
|
6943
|
+
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
6944
|
+
|
6945
|
+
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
6946
|
+
cb(weights_sum, "ffn_moe_weights_sum", il);
|
6947
|
+
|
6948
|
+
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
6949
|
+
cb(weights, "ffn_moe_weights_norm", il);
|
6950
|
+
|
6951
|
+
// compute expert outputs
|
6952
|
+
ggml_tensor * moe_out = nullptr;
|
6953
|
+
|
6954
|
+
for (int i = 0; i < n_expert_used; ++i) {
|
6955
|
+
ggml_tensor * cur_expert;
|
6956
|
+
|
6957
|
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
6958
|
+
cb(cur_up, "ffn_moe_up", il);
|
6959
|
+
|
6960
|
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
6961
|
+
cb(cur_gate, "ffn_moe_gate", il);
|
6962
|
+
|
6963
|
+
//GeLU
|
6964
|
+
cur_gate = ggml_gelu(ctx0, cur_gate);
|
6965
|
+
cb(cur_gate, "ffn_moe_gelu", il);
|
6966
|
+
|
6967
|
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6968
|
+
cb(cur_expert, "ffn_moe_gate_par", il);
|
6969
|
+
|
6970
|
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
6971
|
+
cb(cur_expert, "ffn_moe_down", il);
|
6972
|
+
|
6973
|
+
cur_expert = ggml_mul(ctx0, cur_expert,
|
6974
|
+
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
6975
|
+
cb(cur_expert, "ffn_moe_weighted", il);
|
6976
|
+
|
6977
|
+
if (i == 0) {
|
6978
|
+
moe_out = cur_expert;
|
6979
|
+
} else {
|
6980
|
+
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
6981
|
+
cb(moe_out, "ffn_moe_out", il);
|
6982
|
+
}
|
6983
|
+
}
|
6984
|
+
|
6985
|
+
cur = moe_out;
|
6986
|
+
|
6987
|
+
// Grok
|
6988
|
+
// if layer_out_norm is present then apply it before adding the input
|
6989
|
+
// Idea: maybe ffn_out_norm is a better name
|
6990
|
+
if (model.layers[il].layer_out_norm) {
|
6991
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6992
|
+
model.layers[il].layer_out_norm, NULL,
|
6993
|
+
LLM_NORM_RMS, cb, il);
|
6994
|
+
cb(cur, "layer_out_norm", il);
|
6995
|
+
}
|
6996
|
+
|
6997
|
+
|
6998
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6999
|
+
cb(cur, "ffn_out", il);
|
7000
|
+
|
7001
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
7002
|
+
if (layer_dir != nullptr) {
|
7003
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
7004
|
+
}
|
7005
|
+
cb(cur, "l_out", il);
|
7006
|
+
|
7007
|
+
// input for next layer
|
7008
|
+
inpL = cur;
|
7009
|
+
}
|
7010
|
+
|
7011
|
+
cur = inpL;
|
7012
|
+
|
7013
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7014
|
+
model.output_norm, NULL,
|
7015
|
+
LLM_NORM_RMS, cb, -1);
|
7016
|
+
cb(cur, "result_norm", -1);
|
7017
|
+
|
7018
|
+
// lm_head
|
7019
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7020
|
+
|
7021
|
+
// Grok
|
7022
|
+
// multiply logits by output_multiplier_scale of 0.5773502691896257
|
7023
|
+
|
7024
|
+
cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
|
7025
|
+
|
7026
|
+
cb(cur, "result_output", -1);
|
7027
|
+
|
7028
|
+
ggml_build_forward_expand(gf, cur);
|
7029
|
+
|
7030
|
+
return gf;
|
7031
|
+
}
|
7032
|
+
|
6107
7033
|
struct ggml_cgraph * build_starcoder() {
|
6108
7034
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6109
7035
|
|
@@ -6158,6 +7084,13 @@ struct llm_build_context {
|
|
6158
7084
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6159
7085
|
}
|
6160
7086
|
|
7087
|
+
if (il == n_layer - 1) {
|
7088
|
+
// skip computing output for unused tokens
|
7089
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7090
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7091
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7092
|
+
}
|
7093
|
+
|
6161
7094
|
// add the input
|
6162
7095
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
6163
7096
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -6355,6 +7288,13 @@ struct llm_build_context {
|
|
6355
7288
|
Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6356
7289
|
}
|
6357
7290
|
|
7291
|
+
if (il == n_layer - 1) {
|
7292
|
+
// skip computing output for unused tokens
|
7293
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7294
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7295
|
+
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
7296
|
+
}
|
7297
|
+
|
6358
7298
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
|
6359
7299
|
cb(ffn_inp, "ffn_inp", il);
|
6360
7300
|
|
@@ -6444,6 +7384,13 @@ struct llm_build_context {
|
|
6444
7384
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6445
7385
|
}
|
6446
7386
|
|
7387
|
+
if (il == n_layer - 1) {
|
7388
|
+
// skip computing output for unused tokens
|
7389
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7390
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7391
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7392
|
+
}
|
7393
|
+
|
6447
7394
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6448
7395
|
cb(ffn_inp, "ffn_inp", il);
|
6449
7396
|
|
@@ -6601,6 +7548,13 @@ struct llm_build_context {
|
|
6601
7548
|
}
|
6602
7549
|
cb(cur, "kqv_out", il);
|
6603
7550
|
|
7551
|
+
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
7552
|
+
// skip computing output for unused tokens
|
7553
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7554
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7555
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7556
|
+
}
|
7557
|
+
|
6604
7558
|
// re-add the layer input
|
6605
7559
|
cur = ggml_add(ctx0, cur, inpL);
|
6606
7560
|
|
@@ -6723,6 +7677,13 @@ struct llm_build_context {
|
|
6723
7677
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6724
7678
|
}
|
6725
7679
|
|
7680
|
+
if (il == n_layer - 1) {
|
7681
|
+
// skip computing output for unused tokens
|
7682
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7683
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7684
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7685
|
+
}
|
7686
|
+
|
6726
7687
|
// Add the input
|
6727
7688
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
6728
7689
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -6770,6 +7731,7 @@ struct llm_build_context {
|
|
6770
7731
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6771
7732
|
|
6772
7733
|
struct ggml_tensor * cur;
|
7734
|
+
struct ggml_tensor * pos;
|
6773
7735
|
struct ggml_tensor * inpL;
|
6774
7736
|
|
6775
7737
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
@@ -6780,6 +7742,16 @@ struct llm_build_context {
|
|
6780
7742
|
// positions of the tokens in the KV cache
|
6781
7743
|
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
6782
7744
|
|
7745
|
+
if (model.pos_embd) {
|
7746
|
+
// inp_pos - contains the positions
|
7747
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7748
|
+
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
7749
|
+
cb(pos, "pos_embd", -1);
|
7750
|
+
|
7751
|
+
inpL = ggml_add(ctx0, inpL, pos);
|
7752
|
+
cb(inpL, "inpL", -1);
|
7753
|
+
}
|
7754
|
+
|
6783
7755
|
for (int il = 0; il < n_layer; ++il) {
|
6784
7756
|
struct ggml_tensor * attn_norm;
|
6785
7757
|
|
@@ -6814,11 +7786,39 @@ struct llm_build_context {
|
|
6814
7786
|
cb(Kcur, "Kcur", il);
|
6815
7787
|
cb(Vcur, "Vcur", il);
|
6816
7788
|
|
6817
|
-
|
7789
|
+
// Q/K Layernorm
|
7790
|
+
if (model.layers[il].attn_q_norm) {
|
7791
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
7792
|
+
model.layers[il].attn_q_norm,
|
7793
|
+
model.layers[il].attn_q_norm_b,
|
7794
|
+
LLM_NORM, cb, il);
|
7795
|
+
cb(Qcur, "Qcur", il);
|
6818
7796
|
|
6819
|
-
|
7797
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
7798
|
+
model.layers[il].attn_k_norm,
|
7799
|
+
model.layers[il].attn_k_norm_b,
|
7800
|
+
LLM_NORM, cb, il);
|
7801
|
+
cb(Kcur, "Kcur", il);
|
7802
|
+
|
7803
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7804
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
7805
|
+
|
7806
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6820
7807
|
model.layers[il].wo, model.layers[il].bo,
|
6821
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
7808
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7809
|
+
} else {
|
7810
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7811
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7812
|
+
model.layers[il].wo, model.layers[il].bo,
|
7813
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7814
|
+
}
|
7815
|
+
}
|
7816
|
+
|
7817
|
+
if (il == n_layer - 1) {
|
7818
|
+
// skip computing output for unused tokens
|
7819
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7820
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7821
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
6822
7822
|
}
|
6823
7823
|
|
6824
7824
|
// Add the input
|
@@ -6934,6 +7934,13 @@ struct llm_build_context {
|
|
6934
7934
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6935
7935
|
}
|
6936
7936
|
|
7937
|
+
if (il == n_layer - 1) {
|
7938
|
+
// skip computing output for unused tokens
|
7939
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7940
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7941
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7942
|
+
}
|
7943
|
+
|
6937
7944
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6938
7945
|
cb(ffn_inp, "ffn_inp", il);
|
6939
7946
|
|
@@ -7040,6 +8047,13 @@ struct llm_build_context {
|
|
7040
8047
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7041
8048
|
}
|
7042
8049
|
|
8050
|
+
if (il == n_layer - 1) {
|
8051
|
+
// skip computing output for unused tokens
|
8052
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8053
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8054
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8055
|
+
}
|
8056
|
+
|
7043
8057
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7044
8058
|
cb(ffn_inp, "ffn_inp", il);
|
7045
8059
|
|
@@ -7152,6 +8166,13 @@ struct llm_build_context {
|
|
7152
8166
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7153
8167
|
}
|
7154
8168
|
|
8169
|
+
if (il == n_layer - 1) {
|
8170
|
+
// skip computing output for unused tokens
|
8171
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8172
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8173
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8174
|
+
}
|
8175
|
+
|
7155
8176
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7156
8177
|
cb(ffn_inp, "ffn_inp", il);
|
7157
8178
|
|
@@ -7270,6 +8291,14 @@ struct llm_build_context {
|
|
7270
8291
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
7271
8292
|
}
|
7272
8293
|
|
8294
|
+
if (il == n_layer - 1) {
|
8295
|
+
// skip computing output for unused tokens
|
8296
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8297
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8298
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8299
|
+
attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
|
8300
|
+
}
|
8301
|
+
|
7273
8302
|
// FF
|
7274
8303
|
{
|
7275
8304
|
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
|
@@ -7367,6 +8396,14 @@ struct llm_build_context {
|
|
7367
8396
|
|
7368
8397
|
cur = attention_norm;
|
7369
8398
|
|
8399
|
+
if (il == n_layer - 1) {
|
8400
|
+
// skip computing output for unused tokens
|
8401
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8402
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8403
|
+
sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
|
8404
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8405
|
+
}
|
8406
|
+
|
7370
8407
|
// feed-forward network
|
7371
8408
|
{
|
7372
8409
|
cur = llm_build_ffn(ctx0, cur,
|
@@ -7459,6 +8496,13 @@ struct llm_build_context {
|
|
7459
8496
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7460
8497
|
}
|
7461
8498
|
|
8499
|
+
if (il == n_layer - 1) {
|
8500
|
+
// skip computing output for unused tokens
|
8501
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8502
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8503
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8504
|
+
}
|
8505
|
+
|
7462
8506
|
// add the input
|
7463
8507
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
7464
8508
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -7559,6 +8603,13 @@ struct llm_build_context {
|
|
7559
8603
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7560
8604
|
}
|
7561
8605
|
|
8606
|
+
if (il == n_layer - 1) {
|
8607
|
+
// skip computing output for unused tokens
|
8608
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8609
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8610
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8611
|
+
}
|
8612
|
+
|
7562
8613
|
// add the input
|
7563
8614
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
7564
8615
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -7668,6 +8719,13 @@ struct llm_build_context {
|
|
7668
8719
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7669
8720
|
}
|
7670
8721
|
|
8722
|
+
if (il == n_layer - 1) {
|
8723
|
+
// skip computing output for unused tokens
|
8724
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8725
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8726
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8727
|
+
}
|
8728
|
+
|
7671
8729
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7672
8730
|
cb(ffn_inp, "ffn_inp", il);
|
7673
8731
|
|
@@ -7778,6 +8836,13 @@ struct llm_build_context {
|
|
7778
8836
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7779
8837
|
}
|
7780
8838
|
|
8839
|
+
if (il == n_layer - 1) {
|
8840
|
+
// skip computing output for unused tokens
|
8841
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8842
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8843
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8844
|
+
}
|
8845
|
+
|
7781
8846
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7782
8847
|
cb(ffn_inp, "ffn_inp", il);
|
7783
8848
|
|
@@ -7901,6 +8966,13 @@ struct llm_build_context {
|
|
7901
8966
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7902
8967
|
}
|
7903
8968
|
|
8969
|
+
if (il == n_layer - 1) {
|
8970
|
+
// skip computing output for unused tokens
|
8971
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8972
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8973
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8974
|
+
}
|
8975
|
+
|
7904
8976
|
// scale_res - scale the hidden states for residual connection
|
7905
8977
|
const float scale_res = scale_depth/sqrtf(float(n_layer));
|
7906
8978
|
cur = ggml_scale(ctx0, cur, scale_res);
|
@@ -8015,6 +9087,13 @@ struct llm_build_context {
|
|
8015
9087
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
8016
9088
|
}
|
8017
9089
|
|
9090
|
+
if (il == n_layer - 1) {
|
9091
|
+
// skip computing output for unused tokens
|
9092
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9093
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9094
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9095
|
+
}
|
9096
|
+
|
8018
9097
|
struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
8019
9098
|
cb(sa_out, "sa_out", il);
|
8020
9099
|
|
@@ -8125,7 +9204,13 @@ struct llm_build_context {
|
|
8125
9204
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8126
9205
|
model.layers[il].wo, model.layers[il].bo,
|
8127
9206
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8128
|
-
|
9207
|
+
}
|
9208
|
+
|
9209
|
+
if (il == n_layer - 1) {
|
9210
|
+
// skip computing output for unused tokens
|
9211
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9212
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9213
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8129
9214
|
}
|
8130
9215
|
|
8131
9216
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
@@ -8275,6 +9360,15 @@ struct llm_build_context {
|
|
8275
9360
|
|
8276
9361
|
struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
|
8277
9362
|
|
9363
|
+
if (il == n_layer - 1) {
|
9364
|
+
// skip computing output for unused tokens
|
9365
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9366
|
+
x = ggml_get_rows(ctx0, x, inp_out_ids);
|
9367
|
+
y = ggml_get_rows(ctx0, y, inp_out_ids);
|
9368
|
+
z = ggml_get_rows(ctx0, z, inp_out_ids);
|
9369
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9370
|
+
}
|
9371
|
+
|
8278
9372
|
// {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
|
8279
9373
|
y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
|
8280
9374
|
y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
|
@@ -8305,6 +9399,129 @@ struct llm_build_context {
|
|
8305
9399
|
|
8306
9400
|
return gf;
|
8307
9401
|
}
|
9402
|
+
|
9403
|
+
struct ggml_cgraph * build_command_r() {
|
9404
|
+
|
9405
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
9406
|
+
|
9407
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
9408
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
9409
|
+
const float f_logit_scale = hparams.f_logit_scale;
|
9410
|
+
|
9411
|
+
struct ggml_tensor * cur;
|
9412
|
+
struct ggml_tensor * inpL;
|
9413
|
+
|
9414
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
9415
|
+
|
9416
|
+
// inp_pos - contains the positions
|
9417
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
9418
|
+
|
9419
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
9420
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
9421
|
+
|
9422
|
+
for (int il = 0; il < n_layer; ++il) {
|
9423
|
+
|
9424
|
+
// norm
|
9425
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
9426
|
+
model.layers[il].attn_norm, NULL,
|
9427
|
+
LLM_NORM, cb, il);
|
9428
|
+
cb(cur, "attn_norm", il);
|
9429
|
+
struct ggml_tensor * ffn_inp = cur;
|
9430
|
+
|
9431
|
+
// self-attention
|
9432
|
+
{
|
9433
|
+
// compute Q and K and RoPE them
|
9434
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
9435
|
+
cb(Qcur, "Qcur", il);
|
9436
|
+
if (model.layers[il].bq) {
|
9437
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
9438
|
+
cb(Qcur, "Qcur", il);
|
9439
|
+
}
|
9440
|
+
|
9441
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
9442
|
+
cb(Kcur, "Kcur", il);
|
9443
|
+
if (model.layers[il].bk) {
|
9444
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
9445
|
+
cb(Kcur, "Kcur", il);
|
9446
|
+
}
|
9447
|
+
|
9448
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
9449
|
+
cb(Vcur, "Vcur", il);
|
9450
|
+
if (model.layers[il].bv) {
|
9451
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
9452
|
+
cb(Vcur, "Vcur", il);
|
9453
|
+
}
|
9454
|
+
|
9455
|
+
Qcur = ggml_rope_custom(
|
9456
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9457
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9458
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
9459
|
+
);
|
9460
|
+
cb(Qcur, "Qcur", il);
|
9461
|
+
|
9462
|
+
Kcur = ggml_rope_custom(
|
9463
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9464
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9465
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
9466
|
+
);
|
9467
|
+
cb(Kcur, "Kcur", il);
|
9468
|
+
|
9469
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9470
|
+
model.layers[il].wo, model.layers[il].bo,
|
9471
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9472
|
+
}
|
9473
|
+
|
9474
|
+
if (il == n_layer - 1) {
|
9475
|
+
// skip computing output for unused tokens
|
9476
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9477
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9478
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9479
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
9480
|
+
}
|
9481
|
+
|
9482
|
+
struct ggml_tensor * attn_out = cur;
|
9483
|
+
|
9484
|
+
// feed-forward network
|
9485
|
+
{
|
9486
|
+
cur = llm_build_ffn(ctx0, ffn_inp,
|
9487
|
+
model.layers[il].ffn_up, NULL,
|
9488
|
+
model.layers[il].ffn_gate, NULL,
|
9489
|
+
model.layers[il].ffn_down, NULL,
|
9490
|
+
NULL,
|
9491
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
9492
|
+
cb(cur, "ffn_out", il);
|
9493
|
+
}
|
9494
|
+
|
9495
|
+
// add together residual + FFN + self-attention
|
9496
|
+
cur = ggml_add(ctx0, cur, inpL);
|
9497
|
+
cur = ggml_add(ctx0, cur, attn_out);
|
9498
|
+
cb(cur, "l_out", il);
|
9499
|
+
|
9500
|
+
// input for next layer
|
9501
|
+
inpL = cur;
|
9502
|
+
}
|
9503
|
+
|
9504
|
+
cur = inpL;
|
9505
|
+
|
9506
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
9507
|
+
model.output_norm, NULL,
|
9508
|
+
LLM_NORM, cb, -1);
|
9509
|
+
cb(cur, "result_norm", -1);
|
9510
|
+
|
9511
|
+
// lm_head
|
9512
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
9513
|
+
|
9514
|
+
if (f_logit_scale) {
|
9515
|
+
cur = ggml_scale(ctx0, cur, f_logit_scale);
|
9516
|
+
}
|
9517
|
+
|
9518
|
+
cb(cur, "result_output", -1);
|
9519
|
+
|
9520
|
+
ggml_build_forward_expand(gf, cur);
|
9521
|
+
|
9522
|
+
return gf;
|
9523
|
+
|
9524
|
+
}
|
8308
9525
|
};
|
8309
9526
|
|
8310
9527
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
@@ -8380,12 +9597,15 @@ static struct ggml_cgraph * llama_build_graph(
|
|
8380
9597
|
}
|
8381
9598
|
|
8382
9599
|
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
|
8383
|
-
//
|
8384
|
-
|
8385
|
-
|
8386
|
-
|
8387
|
-
|
8388
|
-
|
9600
|
+
// FIXME: fix in ggml_backend_sched
|
9601
|
+
const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
|
9602
|
+
if (batch.n_tokens < 32 || full_offload) {
|
9603
|
+
if (il != -1 && strcmp(name, "norm") == 0) {
|
9604
|
+
for (auto * backend : lctx.backends) {
|
9605
|
+
if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
|
9606
|
+
ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
|
9607
|
+
break;
|
9608
|
+
}
|
8389
9609
|
}
|
8390
9610
|
}
|
8391
9611
|
}
|
@@ -8410,6 +9630,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
8410
9630
|
{
|
8411
9631
|
result = llm.build_falcon();
|
8412
9632
|
} break;
|
9633
|
+
case LLM_ARCH_GROK:
|
9634
|
+
{
|
9635
|
+
result = llm.build_grok();
|
9636
|
+
} break;
|
8413
9637
|
case LLM_ARCH_STARCODER:
|
8414
9638
|
{
|
8415
9639
|
result = llm.build_starcoder();
|
@@ -8487,6 +9711,14 @@ static struct ggml_cgraph * llama_build_graph(
|
|
8487
9711
|
{
|
8488
9712
|
result = llm.build_mamba();
|
8489
9713
|
} break;
|
9714
|
+
case LLM_ARCH_XVERSE:
|
9715
|
+
{
|
9716
|
+
result = llm.build_xverse();
|
9717
|
+
} break;
|
9718
|
+
case LLM_ARCH_COMMAND_R:
|
9719
|
+
{
|
9720
|
+
result = llm.build_command_r();
|
9721
|
+
} break;
|
8490
9722
|
default:
|
8491
9723
|
GGML_ASSERT(false);
|
8492
9724
|
}
|
@@ -8548,9 +9780,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
8548
9780
|
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
8549
9781
|
}
|
8550
9782
|
|
9783
|
+
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
9784
|
+
GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
|
9785
|
+
const int64_t n_tokens = batch.n_tokens;
|
9786
|
+
|
9787
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
|
9788
|
+
int32_t * data = (int32_t *) lctx.inp_out_ids->data;
|
9789
|
+
|
9790
|
+
if (lctx.n_outputs == n_tokens) {
|
9791
|
+
for (int i = 0; i < n_tokens; ++i) {
|
9792
|
+
data[i] = i;
|
9793
|
+
}
|
9794
|
+
} else if (batch.logits) {
|
9795
|
+
int32_t n_outputs = 0;
|
9796
|
+
for (int i = 0; i < n_tokens; ++i) {
|
9797
|
+
if (batch.logits[i]) {
|
9798
|
+
data[n_outputs++] = i;
|
9799
|
+
}
|
9800
|
+
}
|
9801
|
+
// the graph needs to have been passed the correct number of outputs
|
9802
|
+
GGML_ASSERT(lctx.n_outputs == n_outputs);
|
9803
|
+
} else if (lctx.n_outputs == 1) {
|
9804
|
+
// only keep last output
|
9805
|
+
data[0] = n_tokens - 1;
|
9806
|
+
} else {
|
9807
|
+
GGML_ASSERT(lctx.n_outputs == 0);
|
9808
|
+
}
|
9809
|
+
}
|
9810
|
+
|
8551
9811
|
GGML_ASSERT(
|
9812
|
+
// (!a || b) is a logical implication (a -> b)
|
9813
|
+
// !hparams.causal_attn -> !cparams.causal_attn
|
8552
9814
|
(hparams.causal_attn || !cparams.causal_attn) &&
|
8553
|
-
"
|
9815
|
+
"causal attention with embedding models is not supported"
|
8554
9816
|
);
|
8555
9817
|
|
8556
9818
|
if (lctx.inp_KQ_mask) {
|
@@ -8729,6 +9991,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
8729
9991
|
}
|
8730
9992
|
}
|
8731
9993
|
|
9994
|
+
// Make sure enough space is available for outputs.
|
9995
|
+
// Returns max number of outputs for which space was reserved.
|
9996
|
+
static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
|
9997
|
+
const auto & cparams = lctx.cparams;
|
9998
|
+
const auto & hparams = lctx.model.hparams;
|
9999
|
+
|
10000
|
+
const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
|
10001
|
+
|
10002
|
+
const auto n_batch = cparams.n_batch;
|
10003
|
+
const auto n_vocab = hparams.n_vocab;
|
10004
|
+
const auto n_embd = hparams.n_embd;
|
10005
|
+
|
10006
|
+
// TODO: use a per-batch flag for logits presence instead
|
10007
|
+
const bool has_logits = cparams.causal_attn;
|
10008
|
+
const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
|
10009
|
+
|
10010
|
+
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
|
10011
|
+
const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
|
10012
|
+
|
10013
|
+
if (lctx.output_ids.empty()) {
|
10014
|
+
// init, never resized afterwards
|
10015
|
+
lctx.output_ids.resize(n_batch);
|
10016
|
+
}
|
10017
|
+
|
10018
|
+
const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
|
10019
|
+
const size_t new_size = (logits_size + embd_size) * sizeof(float);
|
10020
|
+
|
10021
|
+
// alloc only when more than the current capacity is required
|
10022
|
+
// TODO: also consider shrinking the buffer
|
10023
|
+
if (!lctx.buf_output || prev_size < new_size) {
|
10024
|
+
if (lctx.buf_output) {
|
10025
|
+
#ifndef NDEBUG
|
10026
|
+
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
|
10027
|
+
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
10028
|
+
#endif
|
10029
|
+
ggml_backend_buffer_free(lctx.buf_output);
|
10030
|
+
lctx.buf_output = nullptr;
|
10031
|
+
lctx.logits = nullptr;
|
10032
|
+
lctx.embd = nullptr;
|
10033
|
+
}
|
10034
|
+
|
10035
|
+
lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
|
10036
|
+
if (lctx.buf_output == nullptr) {
|
10037
|
+
LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
|
10038
|
+
return 0;
|
10039
|
+
}
|
10040
|
+
}
|
10041
|
+
|
10042
|
+
float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
|
10043
|
+
|
10044
|
+
lctx.logits = has_logits ? output_base : nullptr;
|
10045
|
+
lctx.embd = has_embd ? output_base + logits_size : nullptr;
|
10046
|
+
|
10047
|
+
lctx.output_size = n_outputs_max;
|
10048
|
+
lctx.logits_size = logits_size;
|
10049
|
+
lctx.embd_size = embd_size;
|
10050
|
+
|
10051
|
+
// set all ids as invalid (negative)
|
10052
|
+
std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
|
10053
|
+
|
10054
|
+
ggml_backend_buffer_clear(lctx.buf_output, 0);
|
10055
|
+
|
10056
|
+
lctx.n_outputs = 0;
|
10057
|
+
|
10058
|
+
return n_outputs_max;
|
10059
|
+
}
|
10060
|
+
|
10061
|
+
|
8732
10062
|
static void llama_graph_compute(
|
8733
10063
|
llama_context & lctx,
|
8734
10064
|
ggml_cgraph * gf,
|
@@ -8804,16 +10134,8 @@ static int llama_decode_internal(
|
|
8804
10134
|
const int64_t n_embd = hparams.n_embd;
|
8805
10135
|
const int64_t n_vocab = hparams.n_vocab;
|
8806
10136
|
|
8807
|
-
|
8808
|
-
|
8809
|
-
|
8810
|
-
#ifndef NDEBUG
|
8811
|
-
auto & logits_valid = lctx.logits_valid;
|
8812
|
-
logits_valid.clear();
|
8813
|
-
logits_valid.resize(n_tokens_all);
|
8814
|
-
|
8815
|
-
memset(logits_out, 0, lctx.logits_size*sizeof(float));
|
8816
|
-
#endif
|
10137
|
+
uint32_t n_outputs = 0;
|
10138
|
+
uint32_t n_outputs_prev = 0;
|
8817
10139
|
|
8818
10140
|
const auto n_ubatch = cparams.n_ubatch;
|
8819
10141
|
|
@@ -8822,6 +10144,38 @@ static int llama_decode_internal(
|
|
8822
10144
|
std::vector<llama_seq_id *> seq_id_arr;
|
8823
10145
|
std::vector<std::vector<llama_seq_id>> seq_id;
|
8824
10146
|
|
10147
|
+
// count outputs
|
10148
|
+
if (batch_all.logits) {
|
10149
|
+
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
10150
|
+
n_outputs += batch_all.logits[i] != 0;
|
10151
|
+
}
|
10152
|
+
} else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
|
10153
|
+
n_outputs = n_tokens_all;
|
10154
|
+
} else {
|
10155
|
+
// keep last output only
|
10156
|
+
n_outputs = 1;
|
10157
|
+
}
|
10158
|
+
|
10159
|
+
// reserve output buffer
|
10160
|
+
if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
|
10161
|
+
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
|
10162
|
+
return -2;
|
10163
|
+
};
|
10164
|
+
|
10165
|
+
// set output mappings
|
10166
|
+
if (batch_all.logits) {
|
10167
|
+
int32_t i_logits = 0;
|
10168
|
+
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
10169
|
+
if (batch_all.logits[i]) {
|
10170
|
+
lctx.output_ids[i] = i_logits++;
|
10171
|
+
}
|
10172
|
+
}
|
10173
|
+
} else {
|
10174
|
+
for (uint32_t i = 0; i < n_outputs; ++i) {
|
10175
|
+
lctx.output_ids[i] = i;
|
10176
|
+
}
|
10177
|
+
}
|
10178
|
+
|
8825
10179
|
for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
|
8826
10180
|
const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
|
8827
10181
|
llama_batch u_batch = {
|
@@ -8837,6 +10191,27 @@ static int llama_decode_internal(
|
|
8837
10191
|
/* .all_seq_id = */ batch_all.all_seq_id,
|
8838
10192
|
};
|
8839
10193
|
|
10194
|
+
// count the outputs in this u_batch
|
10195
|
+
{
|
10196
|
+
int32_t n_outputs_new = 0;
|
10197
|
+
|
10198
|
+
if (u_batch.logits) {
|
10199
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
10200
|
+
n_outputs_new += u_batch.logits[i] != 0;
|
10201
|
+
}
|
10202
|
+
} else if (n_outputs == n_tokens_all) {
|
10203
|
+
n_outputs_new = n_tokens;
|
10204
|
+
} else {
|
10205
|
+
// keep last output only
|
10206
|
+
if (cur_token + n_tokens >= n_tokens_all) {
|
10207
|
+
n_outputs_new = 1;
|
10208
|
+
}
|
10209
|
+
}
|
10210
|
+
|
10211
|
+
// needs to happen before the graph is built
|
10212
|
+
lctx.n_outputs = n_outputs_new;
|
10213
|
+
}
|
10214
|
+
|
8840
10215
|
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
8841
10216
|
GGML_ASSERT(n_threads > 0);
|
8842
10217
|
|
@@ -8900,23 +10275,37 @@ static int llama_decode_internal(
|
|
8900
10275
|
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
8901
10276
|
struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
|
8902
10277
|
|
8903
|
-
if (
|
10278
|
+
if (lctx.n_outputs == 0) {
|
10279
|
+
// no output
|
10280
|
+
res = nullptr;
|
10281
|
+
embd = nullptr;
|
10282
|
+
} else if (!hparams.causal_attn) {
|
8904
10283
|
res = nullptr; // do not extract logits for embedding models such as BERT
|
8905
10284
|
|
8906
10285
|
// token or sequence embeddings
|
8907
10286
|
embd = gf->nodes[gf->n_nodes - 1];
|
8908
10287
|
|
8909
10288
|
GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
|
8910
|
-
} else {
|
8911
|
-
|
8912
|
-
|
8913
|
-
|
8914
|
-
|
8915
|
-
|
8916
|
-
|
8917
|
-
}
|
8918
|
-
|
10289
|
+
} else if (cparams.embeddings) {
|
10290
|
+
// the embeddings could be in the second to last tensor, or any of the previous tensors
|
10291
|
+
int i_embd = gf->n_nodes - 2;
|
10292
|
+
for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
|
10293
|
+
i_embd = gf->n_nodes - i;
|
10294
|
+
if (i_embd < 0) { break; }
|
10295
|
+
embd = gf->nodes[i_embd];
|
10296
|
+
}
|
10297
|
+
GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
|
10298
|
+
|
10299
|
+
// TODO: use a per-batch flag to know when to skip logits while keeping embeddings
|
10300
|
+
if (!cparams.causal_attn) {
|
10301
|
+
res = nullptr; // do not extract logits when not needed
|
10302
|
+
// skip computing logits
|
10303
|
+
// TODO: is this safe?
|
10304
|
+
gf->n_nodes = i_embd + 1;
|
8919
10305
|
}
|
10306
|
+
} else {
|
10307
|
+
embd = nullptr; // do not extract embeddings when not needed
|
10308
|
+
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
|
8920
10309
|
}
|
8921
10310
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
8922
10311
|
|
@@ -8959,67 +10348,38 @@ static int llama_decode_internal(
|
|
8959
10348
|
//}
|
8960
10349
|
|
8961
10350
|
// extract logits
|
8962
|
-
// TODO: do not compute and extract logits if only embeddings are needed
|
8963
|
-
// update the graphs to skip "result_output" if logits are not needed
|
8964
10351
|
if (res) {
|
8965
10352
|
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
|
8966
10353
|
GGML_ASSERT(backend_res != nullptr);
|
8967
|
-
|
8968
|
-
|
8969
|
-
|
8970
|
-
|
8971
|
-
|
8972
|
-
|
8973
|
-
|
8974
|
-
|
8975
|
-
|
8976
|
-
// extract logits for the range [i_first, i_last)
|
8977
|
-
// group the requests to minimize the number of calls to the backend
|
8978
|
-
ggml_backend_tensor_get_async(backend_res, res,
|
8979
|
-
logits_out + n_vocab*(cur_token + i_first),
|
8980
|
-
i_first*n_vocab*sizeof(float),
|
8981
|
-
(i_last - i_first)*n_vocab*sizeof(float));
|
8982
|
-
i_first = -1;
|
8983
|
-
}
|
8984
|
-
}
|
8985
|
-
#ifndef NDEBUG
|
8986
|
-
logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
|
8987
|
-
#endif
|
8988
|
-
}
|
8989
|
-
} else if (lctx.logits_all) {
|
8990
|
-
ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
|
8991
|
-
#ifndef NDEBUG
|
8992
|
-
std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
|
8993
|
-
#endif
|
8994
|
-
} else {
|
8995
|
-
if (cur_token + n_tokens >= n_tokens_all) {
|
8996
|
-
ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
|
8997
|
-
#ifndef NDEBUG
|
8998
|
-
logits_valid[0] = true;
|
8999
|
-
#endif
|
9000
|
-
}
|
10354
|
+
GGML_ASSERT(lctx.logits != nullptr);
|
10355
|
+
|
10356
|
+
float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
|
10357
|
+
const int32_t n_outputs_new = lctx.n_outputs;
|
10358
|
+
|
10359
|
+
if (n_outputs_new) {
|
10360
|
+
GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
|
10361
|
+
GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
|
10362
|
+
ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
|
9001
10363
|
}
|
9002
10364
|
}
|
9003
10365
|
|
9004
10366
|
// extract embeddings
|
9005
|
-
if (
|
10367
|
+
if (embd) {
|
9006
10368
|
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
|
9007
10369
|
GGML_ASSERT(backend_embd != nullptr);
|
9008
10370
|
|
9009
10371
|
switch (cparams.pooling_type) {
|
9010
10372
|
case LLAMA_POOLING_TYPE_NONE:
|
9011
|
-
{
|
9012
|
-
// extract token embeddings
|
9013
|
-
|
9014
|
-
|
9015
|
-
|
9016
|
-
|
9017
|
-
|
9018
|
-
|
9019
|
-
|
9020
|
-
|
9021
|
-
ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
|
9022
|
-
}
|
10373
|
+
{
|
10374
|
+
// extract token embeddings
|
10375
|
+
GGML_ASSERT(lctx.embd != nullptr);
|
10376
|
+
float * embd_out = lctx.embd + n_outputs_prev*n_embd;
|
10377
|
+
const int32_t n_outputs_new = lctx.n_outputs;
|
10378
|
+
|
10379
|
+
if (n_outputs_new) {
|
10380
|
+
GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
|
10381
|
+
GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
|
10382
|
+
ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
|
9023
10383
|
}
|
9024
10384
|
} break;
|
9025
10385
|
case LLAMA_POOLING_TYPE_CLS:
|
@@ -9046,6 +10406,7 @@ static int llama_decode_internal(
|
|
9046
10406
|
} break;
|
9047
10407
|
}
|
9048
10408
|
}
|
10409
|
+
n_outputs_prev += lctx.n_outputs;
|
9049
10410
|
}
|
9050
10411
|
|
9051
10412
|
// wait for the computation to finish (automatically done when obtaining the model output)
|
@@ -9976,7 +11337,7 @@ struct llm_tokenizer_wpm {
|
|
9976
11337
|
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
|
9977
11338
|
continue;
|
9978
11339
|
}
|
9979
|
-
code =
|
11340
|
+
code = unicode_tolower(code);
|
9980
11341
|
if (type == CODEPOINT_TYPE_WHITESPACE) {
|
9981
11342
|
code = ' ';
|
9982
11343
|
}
|
@@ -9996,7 +11357,7 @@ struct llm_tokenizer_wpm {
|
|
9996
11357
|
std::vector<std::string> words;
|
9997
11358
|
while (r < new_str.size()) {
|
9998
11359
|
// if is whitespace
|
9999
|
-
if (isspace(new_str[r])) {
|
11360
|
+
if (isspace(new_str[r], std::locale::classic())) {
|
10000
11361
|
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
10001
11362
|
l = r + 1;
|
10002
11363
|
r = l;
|
@@ -10010,18 +11371,12 @@ struct llm_tokenizer_wpm {
|
|
10010
11371
|
return words;
|
10011
11372
|
}
|
10012
11373
|
|
10013
|
-
uint32_t to_lower(uint32_t code) {
|
10014
|
-
static const std::locale locale("en_US.UTF-8");
|
10015
|
-
#if defined(_WIN32)
|
10016
|
-
if (code > 0xFFFF) {
|
10017
|
-
return code;
|
10018
|
-
}
|
10019
|
-
#endif
|
10020
|
-
return std::tolower(wchar_t(code), locale);
|
10021
|
-
}
|
10022
|
-
|
10023
11374
|
bool is_ascii_punct(uint32_t code) {
|
10024
|
-
|
11375
|
+
if (code > 0xFF) {
|
11376
|
+
return false;
|
11377
|
+
}
|
11378
|
+
auto c = char(static_cast<unsigned char>(code));
|
11379
|
+
return ispunct(c, std::locale::classic());
|
10025
11380
|
}
|
10026
11381
|
|
10027
11382
|
bool is_chinese_char(uint32_t cpt) {
|
@@ -10266,28 +11621,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
10266
11621
|
// grammar - internal
|
10267
11622
|
//
|
10268
11623
|
|
10269
|
-
struct llama_partial_utf8 {
|
10270
|
-
uint32_t value; // bit value so far (unshifted)
|
10271
|
-
int n_remain; // num bytes remaining; -1 indicates invalid sequence
|
10272
|
-
};
|
10273
|
-
|
10274
|
-
struct llama_grammar {
|
10275
|
-
const std::vector<std::vector<llama_grammar_element>> rules;
|
10276
|
-
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
10277
|
-
|
10278
|
-
// buffer for partially generated UTF-8 sequence from accepted tokens
|
10279
|
-
llama_partial_utf8 partial_utf8;
|
10280
|
-
};
|
10281
|
-
|
10282
|
-
struct llama_grammar_candidate {
|
10283
|
-
size_t index;
|
10284
|
-
const uint32_t * code_points;
|
10285
|
-
llama_partial_utf8 partial_utf8;
|
10286
|
-
};
|
10287
11624
|
|
10288
11625
|
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
|
10289
11626
|
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
|
10290
|
-
|
11627
|
+
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
10291
11628
|
const std::string & src,
|
10292
11629
|
llama_partial_utf8 partial_start) {
|
10293
11630
|
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
|
@@ -10489,7 +11826,7 @@ static void llama_grammar_advance_stack(
|
|
10489
11826
|
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
10490
11827
|
// produces the N possible stacks if the given char is accepted at those
|
10491
11828
|
// positions
|
10492
|
-
|
11829
|
+
std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
10493
11830
|
const std::vector<std::vector<llama_grammar_element>> & rules,
|
10494
11831
|
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
10495
11832
|
const uint32_t chr) {
|
@@ -11715,7 +13052,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
11715
13052
|
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
11716
13053
|
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
11717
13054
|
// tensor name.
|
11718
|
-
n_layer /= n_expert;
|
11719
13055
|
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
|
11720
13056
|
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
|
11721
13057
|
}
|
@@ -11729,30 +13065,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
11729
13065
|
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
|
11730
13066
|
// with the quantization of the output tensor
|
11731
13067
|
if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
|
11732
|
-
|
11733
|
-
|
11734
|
-
|
11735
|
-
|
11736
|
-
|
11737
|
-
|
11738
|
-
|
11739
|
-
|
11740
|
-
|
11741
|
-
|
13068
|
+
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
13069
|
+
new_type = qs.params->output_tensor_type;
|
13070
|
+
} else {
|
13071
|
+
int nx = tensor->ne[0];
|
13072
|
+
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
13073
|
+
new_type = GGML_TYPE_Q8_0;
|
13074
|
+
}
|
13075
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
13076
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
|
13077
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
13078
|
+
new_type = GGML_TYPE_Q5_K;
|
13079
|
+
}
|
13080
|
+
else if (new_type != GGML_TYPE_Q8_0) {
|
13081
|
+
new_type = GGML_TYPE_Q6_K;
|
13082
|
+
}
|
11742
13083
|
}
|
11743
13084
|
} else if (name == "token_embd.weight") {
|
11744
|
-
if (
|
11745
|
-
|
11746
|
-
|
11747
|
-
|
11748
|
-
|
11749
|
-
|
11750
|
-
|
11751
|
-
|
11752
|
-
|
13085
|
+
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
|
13086
|
+
new_type = qs.params->token_embedding_type;
|
13087
|
+
} else {
|
13088
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
|
13089
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
13090
|
+
new_type = GGML_TYPE_Q2_K;
|
13091
|
+
}
|
13092
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
13093
|
+
new_type = GGML_TYPE_IQ3_S;
|
13094
|
+
}
|
13095
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
13096
|
+
new_type = GGML_TYPE_IQ3_S;
|
13097
|
+
}
|
11753
13098
|
}
|
11754
13099
|
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
11755
|
-
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
13100
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
11756
13101
|
if (name.find("attn_v.weight") != std::string::npos) {
|
11757
13102
|
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
11758
13103
|
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
@@ -11771,7 +13116,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
11771
13116
|
if (qs.model.hparams.n_expert == 8) {
|
11772
13117
|
new_type = GGML_TYPE_Q5_K;
|
11773
13118
|
} else {
|
11774
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
|
13119
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
|
11775
13120
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
|
11776
13121
|
}
|
11777
13122
|
}
|
@@ -11785,13 +13130,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
11785
13130
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
11786
13131
|
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
|
11787
13132
|
}
|
11788
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
11789
|
-
new_type = GGML_TYPE_Q4_K;
|
11790
|
-
}
|
11791
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
11792
|
-
new_type = GGML_TYPE_Q4_K;
|
11793
|
-
}
|
11794
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
13133
|
+
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
|
11795
13134
|
new_type = GGML_TYPE_Q4_K;
|
11796
13135
|
}
|
11797
13136
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
@@ -11944,7 +13283,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
11944
13283
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
11945
13284
|
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
|
11946
13285
|
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
|
11947
|
-
new_type == GGML_TYPE_IQ3_XXS ||
|
13286
|
+
new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
|
13287
|
+
new_type == GGML_TYPE_IQ1_M) {
|
11948
13288
|
int nx = tensor->ne[0];
|
11949
13289
|
int ny = tensor->ne[1];
|
11950
13290
|
if (nx % QK_K != 0) {
|
@@ -11962,6 +13302,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
11962
13302
|
case GGML_TYPE_IQ3_XXS:
|
11963
13303
|
case GGML_TYPE_IQ3_S:
|
11964
13304
|
case GGML_TYPE_IQ1_S:
|
13305
|
+
case GGML_TYPE_IQ1_M:
|
11965
13306
|
case GGML_TYPE_Q2_K:
|
11966
13307
|
case GGML_TYPE_Q3_K:
|
11967
13308
|
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
|
@@ -12043,6 +13384,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12043
13384
|
case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
|
12044
13385
|
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
|
12045
13386
|
case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
|
13387
|
+
case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
|
12046
13388
|
case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
|
12047
13389
|
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
|
12048
13390
|
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
|
@@ -12065,8 +13407,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12065
13407
|
constexpr bool use_mmap = false;
|
12066
13408
|
#endif
|
12067
13409
|
|
12068
|
-
|
12069
|
-
|
13410
|
+
llama_model_kv_override * kv_overrides = nullptr;
|
13411
|
+
if (params->kv_overrides) {
|
13412
|
+
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
13413
|
+
kv_overrides = v->data();
|
13414
|
+
}
|
13415
|
+
llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
|
13416
|
+
ml.init_mappings(false); // no prefetching
|
12070
13417
|
|
12071
13418
|
llama_model model;
|
12072
13419
|
llm_load_arch(ml, model);
|
@@ -12090,36 +13437,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12090
13437
|
struct gguf_context * ctx_out = gguf_init_empty();
|
12091
13438
|
|
12092
13439
|
// copy the KV pairs from the input file
|
12093
|
-
gguf_set_kv (ctx_out, ml.
|
13440
|
+
gguf_set_kv (ctx_out, ml.meta);
|
12094
13441
|
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
12095
13442
|
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
12096
13443
|
|
13444
|
+
if (params->kv_overrides) {
|
13445
|
+
const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
|
13446
|
+
for (auto & o : overrides) {
|
13447
|
+
if (o.key[0] == 0) break;
|
13448
|
+
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
13449
|
+
gguf_set_val_f32(ctx_out, o.key, o.float_value);
|
13450
|
+
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
13451
|
+
gguf_set_val_i32(ctx_out, o.key, o.int_value);
|
13452
|
+
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
13453
|
+
gguf_set_val_bool(ctx_out, o.key, o.bool_value);
|
13454
|
+
} else {
|
13455
|
+
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
|
13456
|
+
}
|
13457
|
+
}
|
13458
|
+
}
|
13459
|
+
|
12097
13460
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
12098
|
-
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
13461
|
+
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
12099
13462
|
|
12100
13463
|
const std::string name = ggml_get_name(meta);
|
12101
13464
|
|
12102
13465
|
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
12103
13466
|
if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
|
12104
13467
|
++qs.n_attention_wv;
|
12105
|
-
}
|
12106
|
-
else if (name.find("ffn_down") != std::string::npos) {
|
12107
|
-
++qs.n_ffn_down;
|
12108
|
-
}
|
12109
|
-
else if (name.find("ffn_gate") != std::string::npos) {
|
12110
|
-
++qs.n_ffn_gate;
|
12111
|
-
}
|
12112
|
-
else if (name.find("ffn_up") != std::string::npos) {
|
12113
|
-
++qs.n_ffn_up;
|
12114
|
-
}
|
12115
|
-
else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
13468
|
+
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
12116
13469
|
qs.has_output = true;
|
12117
13470
|
}
|
12118
13471
|
}
|
12119
|
-
|
12120
|
-
|
12121
|
-
|
12122
|
-
|
13472
|
+
|
13473
|
+
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
13474
|
+
|
13475
|
+
// sanity checks
|
13476
|
+
GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
|
12123
13477
|
|
12124
13478
|
size_t total_size_org = 0;
|
12125
13479
|
size_t total_size_new = 0;
|
@@ -12135,7 +13489,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12135
13489
|
|
12136
13490
|
// populate the original tensors so we get an initial meta data
|
12137
13491
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
12138
|
-
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
13492
|
+
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
12139
13493
|
gguf_add_tensor(ctx_out, meta);
|
12140
13494
|
}
|
12141
13495
|
|
@@ -12149,6 +13503,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12149
13503
|
// placeholder for the meta data
|
12150
13504
|
::zeros(fout, meta_size);
|
12151
13505
|
|
13506
|
+
const auto tn = LLM_TN(model.arch);
|
13507
|
+
|
12152
13508
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
12153
13509
|
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
|
12154
13510
|
|
@@ -12171,8 +13527,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12171
13527
|
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
12172
13528
|
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
|
12173
13529
|
|
12174
|
-
// quantize only 2D tensors
|
12175
|
-
quantize &= (ggml_n_dims(tensor)
|
13530
|
+
// quantize only 2D and 3D tensors (experts)
|
13531
|
+
quantize &= (ggml_n_dims(tensor) >= 2);
|
12176
13532
|
quantize &= params->quantize_output_tensor || name != "output.weight";
|
12177
13533
|
quantize &= !params->only_copy;
|
12178
13534
|
|
@@ -12201,6 +13557,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12201
13557
|
if (!params->pure && ggml_is_quantized(default_type)) {
|
12202
13558
|
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
|
12203
13559
|
}
|
13560
|
+
else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
|
13561
|
+
new_type = params->token_embedding_type;
|
13562
|
+
}
|
13563
|
+
else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
|
13564
|
+
new_type = params->output_tensor_type;
|
13565
|
+
}
|
12204
13566
|
|
12205
13567
|
// If we've decided to quantize to the same type the tensor is already
|
12206
13568
|
// in then there's nothing to do.
|
@@ -12221,11 +13583,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12221
13583
|
if (it == imatrix_data->end()) {
|
12222
13584
|
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
12223
13585
|
} else {
|
12224
|
-
if (it->second.size() == (size_t)tensor->ne[0]) {
|
13586
|
+
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
|
12225
13587
|
imatrix = it->second.data();
|
12226
13588
|
} else {
|
12227
13589
|
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
|
12228
|
-
int(it->second.size()), int(tensor->ne[0]), tensor->name);
|
13590
|
+
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
|
13591
|
+
|
13592
|
+
// this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
|
13593
|
+
// this is a significant error and it may be good idea to abort the process if this happens,
|
13594
|
+
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
|
13595
|
+
// tok_embd should be ignored in this case, since it always causes this warning
|
13596
|
+
if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
|
13597
|
+
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
|
13598
|
+
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
|
13599
|
+
}
|
12229
13600
|
}
|
12230
13601
|
}
|
12231
13602
|
}
|
@@ -12233,6 +13604,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12233
13604
|
new_type == GGML_TYPE_IQ2_XS ||
|
12234
13605
|
new_type == GGML_TYPE_IQ2_S ||
|
12235
13606
|
new_type == GGML_TYPE_IQ1_S ||
|
13607
|
+
(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
|
12236
13608
|
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
12237
13609
|
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
12238
13610
|
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
@@ -12261,15 +13633,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12261
13633
|
new_data = work.data();
|
12262
13634
|
|
12263
13635
|
const int n_per_row = tensor->ne[0];
|
12264
|
-
const int nrows =
|
13636
|
+
const int nrows = tensor->ne[1];
|
12265
13637
|
|
12266
13638
|
static const int min_chunk_size = 32 * 512;
|
12267
13639
|
const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
|
12268
13640
|
|
12269
|
-
const int
|
13641
|
+
const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
13642
|
+
const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
12270
13643
|
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
12271
|
-
new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
|
12272
13644
|
|
13645
|
+
// quantize each expert separately since they have different importance matrices
|
13646
|
+
new_size = 0;
|
13647
|
+
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
13648
|
+
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
13649
|
+
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
13650
|
+
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
13651
|
+
|
13652
|
+
new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
13653
|
+
}
|
12273
13654
|
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
12274
13655
|
}
|
12275
13656
|
total_size_org += ggml_nbytes(tensor);
|
@@ -12340,7 +13721,7 @@ static int llama_apply_lora_from_file_internal(
|
|
12340
13721
|
if (path_base_model) {
|
12341
13722
|
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
12342
13723
|
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
12343
|
-
ml->
|
13724
|
+
ml->init_mappings(/*prefetch*/ false); // no prefetching
|
12344
13725
|
}
|
12345
13726
|
|
12346
13727
|
struct tensor_meta {
|
@@ -12461,7 +13842,7 @@ static int llama_apply_lora_from_file_internal(
|
|
12461
13842
|
|
12462
13843
|
ggml_tensor * base_t;
|
12463
13844
|
if (ml) {
|
12464
|
-
if (
|
13845
|
+
if (!ml->get_tensor_meta(base_name.c_str())) {
|
12465
13846
|
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
12466
13847
|
return 1;
|
12467
13848
|
}
|
@@ -12645,11 +14026,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
12645
14026
|
struct llama_model_quantize_params result = {
|
12646
14027
|
/*.nthread =*/ 0,
|
12647
14028
|
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
14029
|
+
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
|
14030
|
+
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
|
12648
14031
|
/*.allow_requantize =*/ false,
|
12649
14032
|
/*.quantize_output_tensor =*/ true,
|
12650
14033
|
/*.only_copy =*/ false,
|
12651
14034
|
/*.pure =*/ false,
|
12652
14035
|
/*.imatrix =*/ nullptr,
|
14036
|
+
/*.kv_overrides =*/ nullptr,
|
12653
14037
|
};
|
12654
14038
|
|
12655
14039
|
return result;
|
@@ -12658,7 +14042,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
12658
14042
|
size_t llama_max_devices(void) {
|
12659
14043
|
#if defined(GGML_USE_METAL)
|
12660
14044
|
return 1;
|
12661
|
-
#elif defined(
|
14045
|
+
#elif defined(GGML_USE_CUDA)
|
12662
14046
|
return GGML_CUDA_MAX_DEVICES;
|
12663
14047
|
#elif defined(GGML_USE_SYCL)
|
12664
14048
|
return GGML_SYCL_MAX_DEVICES;
|
@@ -12678,8 +14062,8 @@ bool llama_supports_mlock(void) {
|
|
12678
14062
|
}
|
12679
14063
|
|
12680
14064
|
bool llama_supports_gpu_offload(void) {
|
12681
|
-
#if defined(
|
12682
|
-
defined(GGML_USE_SYCL)
|
14065
|
+
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
14066
|
+
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
12683
14067
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
12684
14068
|
return true;
|
12685
14069
|
#else
|
@@ -12786,7 +14170,7 @@ struct llama_context * llama_new_context_with_model(
|
|
12786
14170
|
const auto & hparams = model->hparams;
|
12787
14171
|
auto & cparams = ctx->cparams;
|
12788
14172
|
|
12789
|
-
|
14173
|
+
cparams.n_seq_max = std::max(1u, params.n_seq_max);
|
12790
14174
|
cparams.n_threads = params.n_threads;
|
12791
14175
|
cparams.n_threads_batch = params.n_threads_batch;
|
12792
14176
|
cparams.yarn_ext_factor = params.yarn_ext_factor;
|
@@ -12802,6 +14186,9 @@ struct llama_context * llama_new_context_with_model(
|
|
12802
14186
|
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
|
12803
14187
|
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
12804
14188
|
|
14189
|
+
// this is necessary due to kv_self.n being padded later during inference
|
14190
|
+
cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
|
14191
|
+
|
12805
14192
|
// with causal attention, the batch size is limited by the context size
|
12806
14193
|
cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
|
12807
14194
|
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
@@ -12881,32 +14268,43 @@ struct llama_context * llama_new_context_with_model(
|
|
12881
14268
|
}
|
12882
14269
|
ctx->backends.push_back(ctx->backend_metal);
|
12883
14270
|
}
|
12884
|
-
#elif defined(
|
12885
|
-
if (model->
|
14271
|
+
#elif defined(GGML_USE_CUDA)
|
14272
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
12886
14273
|
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
12887
|
-
|
12888
|
-
|
14274
|
+
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
14275
|
+
if (backend == nullptr) {
|
14276
|
+
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
14277
|
+
llama_free(ctx);
|
14278
|
+
return nullptr;
|
14279
|
+
}
|
14280
|
+
ctx->backends.push_back(backend);
|
14281
|
+
} else {
|
14282
|
+
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
14283
|
+
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
14284
|
+
ggml_backend_t backend = ggml_backend_cuda_init(device);
|
12889
14285
|
if (backend == nullptr) {
|
12890
|
-
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__,
|
14286
|
+
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
|
12891
14287
|
llama_free(ctx);
|
12892
14288
|
return nullptr;
|
12893
14289
|
}
|
12894
14290
|
ctx->backends.push_back(backend);
|
12895
|
-
} else {
|
12896
|
-
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
12897
|
-
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
12898
|
-
ggml_backend_t backend = ggml_backend_cuda_init(device);
|
12899
|
-
if (backend == nullptr) {
|
12900
|
-
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
|
12901
|
-
llama_free(ctx);
|
12902
|
-
return nullptr;
|
12903
|
-
}
|
12904
|
-
ctx->backends.push_back(backend);
|
12905
|
-
}
|
12906
14291
|
}
|
12907
14292
|
}
|
12908
14293
|
#elif defined(GGML_USE_VULKAN)
|
12909
|
-
if (model->
|
14294
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
14295
|
+
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
|
14296
|
+
llama_free(ctx);
|
14297
|
+
return nullptr;
|
14298
|
+
}
|
14299
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
14300
|
+
ggml_backend_t backend = ggml_backend_vk_init(0);
|
14301
|
+
if (backend == nullptr) {
|
14302
|
+
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
14303
|
+
llama_free(ctx);
|
14304
|
+
return nullptr;
|
14305
|
+
}
|
14306
|
+
ctx->backends.push_back(backend);
|
14307
|
+
} else {
|
12910
14308
|
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
12911
14309
|
ggml_backend_t backend = ggml_backend_vk_init(device);
|
12912
14310
|
if (backend == nullptr) {
|
@@ -12918,31 +14316,28 @@ struct llama_context * llama_new_context_with_model(
|
|
12918
14316
|
}
|
12919
14317
|
}
|
12920
14318
|
#elif defined(GGML_USE_SYCL)
|
12921
|
-
|
12922
|
-
|
12923
|
-
|
12924
|
-
|
12925
|
-
|
14319
|
+
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
14320
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
14321
|
+
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
|
14322
|
+
if (backend == nullptr) {
|
14323
|
+
int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
|
14324
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
|
14325
|
+
llama_free(ctx);
|
14326
|
+
return nullptr;
|
14327
|
+
}
|
14328
|
+
ctx->backends.push_back(backend);
|
14329
|
+
} else {
|
14330
|
+
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
14331
|
+
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
14332
|
+
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
12926
14333
|
if (backend == nullptr) {
|
12927
|
-
|
14334
|
+
int id_list[GGML_SYCL_MAX_DEVICES];
|
14335
|
+
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
14336
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
|
12928
14337
|
llama_free(ctx);
|
12929
14338
|
return nullptr;
|
12930
14339
|
}
|
12931
14340
|
ctx->backends.push_back(backend);
|
12932
|
-
} else {
|
12933
|
-
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
12934
|
-
int id_list[GGML_SYCL_MAX_DEVICES];
|
12935
|
-
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
12936
|
-
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
12937
|
-
int device_id = id_list[i];
|
12938
|
-
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
12939
|
-
if (backend == nullptr) {
|
12940
|
-
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
|
12941
|
-
llama_free(ctx);
|
12942
|
-
return nullptr;
|
12943
|
-
}
|
12944
|
-
ctx->backends.push_back(backend);
|
12945
|
-
}
|
12946
14341
|
}
|
12947
14342
|
}
|
12948
14343
|
#elif defined(GGML_USE_KOMPUTE)
|
@@ -12990,25 +14385,12 @@ struct llama_context * llama_new_context_with_model(
|
|
12990
14385
|
|
12991
14386
|
// graph outputs buffer
|
12992
14387
|
{
|
12993
|
-
// resized during inference
|
12994
|
-
ctx
|
12995
|
-
|
12996
|
-
|
12997
|
-
const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
|
12998
|
-
|
12999
|
-
ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
|
13000
|
-
if (ctx->buf_output == nullptr) {
|
13001
|
-
LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
|
14388
|
+
// resized during inference when a batch uses more outputs
|
14389
|
+
if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
|
14390
|
+
LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
|
13002
14391
|
llama_free(ctx);
|
13003
14392
|
return nullptr;
|
13004
14393
|
}
|
13005
|
-
ggml_backend_buffer_clear(ctx->buf_output, 0);
|
13006
|
-
|
13007
|
-
|
13008
|
-
ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
|
13009
|
-
if (params.embeddings) {
|
13010
|
-
ctx->embd = ctx->logits + ctx->logits_size;
|
13011
|
-
}
|
13012
14394
|
|
13013
14395
|
LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
|
13014
14396
|
ggml_backend_buffer_name(ctx->buf_output),
|
@@ -13033,7 +14415,7 @@ struct llama_context * llama_new_context_with_model(
|
|
13033
14415
|
|
13034
14416
|
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
13035
14417
|
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
|
13036
|
-
#ifndef
|
14418
|
+
#ifndef GGML_USE_CUDA
|
13037
14419
|
// pipeline parallelism requires support for async compute and events
|
13038
14420
|
// currently this is only implemented in the CUDA backend
|
13039
14421
|
pipeline_parallel = false;
|
@@ -13061,14 +14443,17 @@ struct llama_context * llama_new_context_with_model(
|
|
13061
14443
|
ggml_backend_t backend = ctx->backends[i];
|
13062
14444
|
ggml_backend_buffer_type_t buft = backend_buft[i];
|
13063
14445
|
size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
|
13064
|
-
|
13065
|
-
|
13066
|
-
|
14446
|
+
if (size > 1) {
|
14447
|
+
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
14448
|
+
ggml_backend_buft_name(buft),
|
14449
|
+
size / 1024.0 / 1024.0);
|
14450
|
+
}
|
13067
14451
|
}
|
13068
14452
|
|
13069
14453
|
// note: the number of splits during measure is higher than during inference due to the kv shift
|
13070
14454
|
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
13071
|
-
LLAMA_LOG_INFO("%s: graph
|
14455
|
+
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
|
14456
|
+
LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
|
13072
14457
|
}
|
13073
14458
|
}
|
13074
14459
|
|
@@ -13138,10 +14523,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
13138
14523
|
case LLM_ARCH_ORION:
|
13139
14524
|
case LLM_ARCH_INTERNLM2:
|
13140
14525
|
case LLM_ARCH_MINICPM:
|
14526
|
+
case LLM_ARCH_XVERSE:
|
14527
|
+
case LLM_ARCH_COMMAND_R:
|
13141
14528
|
return LLAMA_ROPE_TYPE_NORM;
|
13142
14529
|
|
13143
14530
|
// the pairs of head values are offset by n_rot/2
|
13144
14531
|
case LLM_ARCH_FALCON:
|
14532
|
+
case LLM_ARCH_GROK:
|
13145
14533
|
case LLM_ARCH_PERSIMMON:
|
13146
14534
|
case LLM_ARCH_BERT:
|
13147
14535
|
case LLM_ARCH_NOMIC_BERT:
|
@@ -13174,6 +14562,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
|
|
13174
14562
|
return model->hparams.n_embd;
|
13175
14563
|
}
|
13176
14564
|
|
14565
|
+
int32_t llama_n_layer(const struct llama_model * model) {
|
14566
|
+
return model->hparams.n_layer;
|
14567
|
+
}
|
14568
|
+
|
13177
14569
|
float llama_rope_freq_scale_train(const struct llama_model * model) {
|
13178
14570
|
return model->hparams.rope_freq_scale_train;
|
13179
14571
|
}
|
@@ -13273,6 +14665,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
|
|
13273
14665
|
}
|
13274
14666
|
}
|
13275
14667
|
|
14668
|
+
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
|
14669
|
+
GGML_ASSERT(cvec.tensors.empty());
|
14670
|
+
GGML_ASSERT(cvec.ctxs.empty());
|
14671
|
+
GGML_ASSERT(cvec.bufs.empty());
|
14672
|
+
|
14673
|
+
// count layer buffer types
|
14674
|
+
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
14675
|
+
for (int64_t i = 0; i < model.hparams.n_layer; i++) {
|
14676
|
+
buft_layer_count[model.buft_layer[i].buft]++;
|
14677
|
+
}
|
14678
|
+
|
14679
|
+
// allocate contexts
|
14680
|
+
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
14681
|
+
for (auto & it : buft_layer_count) {
|
14682
|
+
int n_layers = it.second;
|
14683
|
+
struct ggml_init_params params = {
|
14684
|
+
/*.mem_size =*/ n_layers * ggml_tensor_overhead(),
|
14685
|
+
/*.mem_buffer =*/ NULL,
|
14686
|
+
/*.no_alloc =*/ true,
|
14687
|
+
};
|
14688
|
+
ggml_context * ctx = ggml_init(params);
|
14689
|
+
if (!ctx) {
|
14690
|
+
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
|
14691
|
+
return 1;
|
14692
|
+
}
|
14693
|
+
ctx_map[it.first] = ctx;
|
14694
|
+
}
|
14695
|
+
|
14696
|
+
// make tensors
|
14697
|
+
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
|
14698
|
+
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
14699
|
+
struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
|
14700
|
+
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
|
14701
|
+
cvec.tensors.push_back(tensor);
|
14702
|
+
}
|
14703
|
+
|
14704
|
+
// allocate tensors / buffers and zero
|
14705
|
+
for (auto it : ctx_map) {
|
14706
|
+
ggml_backend_buffer_type_t buft = it.first;
|
14707
|
+
ggml_context * ctx = it.second;
|
14708
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
14709
|
+
if (!buf) {
|
14710
|
+
LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
|
14711
|
+
return false;
|
14712
|
+
}
|
14713
|
+
ggml_backend_buffer_clear(buf, 0);
|
14714
|
+
cvec.ctxs.push_back(ctx);
|
14715
|
+
cvec.bufs.push_back(buf);
|
14716
|
+
}
|
14717
|
+
|
14718
|
+
return true;
|
14719
|
+
}
|
14720
|
+
|
14721
|
+
int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
|
14722
|
+
const llama_model & model = lctx->model;
|
14723
|
+
llama_control_vector & cvec = lctx->cvec;
|
14724
|
+
|
14725
|
+
if (data == nullptr) {
|
14726
|
+
// disable the current control vector (but leave allocated for later)
|
14727
|
+
cvec.layer_start = -1;
|
14728
|
+
cvec.layer_end = -1;
|
14729
|
+
return 0;
|
14730
|
+
}
|
14731
|
+
|
14732
|
+
if (n_embd != (int) model.hparams.n_embd) {
|
14733
|
+
LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
|
14734
|
+
return 1;
|
14735
|
+
}
|
14736
|
+
|
14737
|
+
if (cvec.tensors.empty()) {
|
14738
|
+
if (!llama_control_vector_init(cvec, model)) {
|
14739
|
+
return 1;
|
14740
|
+
}
|
14741
|
+
}
|
14742
|
+
|
14743
|
+
cvec.layer_start = il_start;
|
14744
|
+
cvec.layer_end = il_end;
|
14745
|
+
|
14746
|
+
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
14747
|
+
assert(cvec.tensors[il] != nullptr);
|
14748
|
+
|
14749
|
+
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
|
14750
|
+
if (off + n_embd <= len) {
|
14751
|
+
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
|
14752
|
+
}
|
14753
|
+
}
|
14754
|
+
|
14755
|
+
return 0;
|
14756
|
+
}
|
14757
|
+
|
13276
14758
|
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
|
13277
14759
|
struct llama_kv_cache_view result = {
|
13278
14760
|
/*.n_cells = */ 0,
|
@@ -13426,27 +14908,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
|
|
13426
14908
|
|
13427
14909
|
// Returns the *maximum* size of the state
|
13428
14910
|
size_t llama_get_state_size(const struct llama_context * ctx) {
|
14911
|
+
const auto & cparams = ctx->cparams;
|
14912
|
+
const auto & hparams = ctx->model.hparams;
|
14913
|
+
|
13429
14914
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
13430
14915
|
// for reference, std::mt19937(1337) serializes to 6701 bytes.
|
13431
14916
|
const size_t s_rng_size = sizeof(size_t);
|
13432
14917
|
const size_t s_rng = LLAMA_MAX_RNG_STATE;
|
14918
|
+
const size_t s_n_outputs = sizeof(size_t);
|
14919
|
+
// assume worst case for outputs although only currently set ones are serialized
|
14920
|
+
const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t);
|
13433
14921
|
const size_t s_logits_size = sizeof(size_t);
|
13434
|
-
|
13435
|
-
const size_t s_logits = ctx->logits_size * sizeof(float);
|
14922
|
+
const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
|
13436
14923
|
const size_t s_embedding_size = sizeof(size_t);
|
13437
|
-
const size_t s_embedding = ctx->embd_size * sizeof(float);
|
14924
|
+
const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0;
|
13438
14925
|
const size_t s_kv_buf_size = sizeof(size_t);
|
13439
14926
|
const size_t s_kv_head = sizeof(uint32_t);
|
13440
14927
|
const size_t s_kv_size = sizeof(uint32_t);
|
13441
14928
|
const size_t s_kv_used = sizeof(uint32_t);
|
13442
14929
|
const size_t s_kv = ctx->kv_self.total_size();
|
13443
|
-
|
13444
|
-
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
|
14930
|
+
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
|
13445
14931
|
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
|
13446
14932
|
|
13447
14933
|
const size_t s_total = (
|
13448
14934
|
+ s_rng_size
|
13449
14935
|
+ s_rng
|
14936
|
+
+ s_n_outputs
|
14937
|
+
+ s_output_pos
|
13450
14938
|
+ s_logits_size
|
13451
14939
|
+ s_logits
|
13452
14940
|
+ s_embedding_size
|
@@ -13521,7 +15009,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13521
15009
|
std::ostringstream rng_ss;
|
13522
15010
|
rng_ss << ctx->rng;
|
13523
15011
|
|
13524
|
-
const std::string & rng_str
|
15012
|
+
const std::string & rng_str = rng_ss.str();
|
13525
15013
|
const size_t rng_size = rng_str.size();
|
13526
15014
|
|
13527
15015
|
GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
|
@@ -13530,25 +15018,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13530
15018
|
data_ctx->write(rng_str.data(), rng_size);
|
13531
15019
|
}
|
13532
15020
|
|
13533
|
-
// copy
|
15021
|
+
// copy outputs
|
13534
15022
|
{
|
13535
|
-
|
15023
|
+
// Can't use ctx->n_outputs because it's not for the
|
15024
|
+
// entire last batch when n_ubatch is smaller than n_batch
|
15025
|
+
size_t n_outputs = 0;
|
13536
15026
|
|
13537
|
-
|
15027
|
+
// copy output ids
|
15028
|
+
{
|
15029
|
+
std::vector<int32_t> output_pos;
|
13538
15030
|
|
13539
|
-
|
13540
|
-
|
15031
|
+
const size_t n_batch = ctx->cparams.n_batch;
|
15032
|
+
const auto & output_ids = ctx->output_ids;
|
15033
|
+
|
15034
|
+
output_pos.resize(ctx->output_size);
|
15035
|
+
|
15036
|
+
// build a more compact representation of the output ids
|
15037
|
+
for (size_t i = 0; i < n_batch; ++i) {
|
15038
|
+
// map an output id to a position in the batch
|
15039
|
+
int32_t pos = output_ids[i];
|
15040
|
+
if (pos >= 0) {
|
15041
|
+
if ((size_t) pos >= n_outputs) {
|
15042
|
+
n_outputs = pos + 1;
|
15043
|
+
}
|
15044
|
+
GGML_ASSERT((size_t) pos < ctx->output_size);
|
15045
|
+
output_pos[pos] = i;
|
15046
|
+
}
|
15047
|
+
}
|
15048
|
+
|
15049
|
+
data_ctx->write(&n_outputs, sizeof(n_outputs));
|
15050
|
+
|
15051
|
+
if (n_outputs) {
|
15052
|
+
data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
|
15053
|
+
}
|
13541
15054
|
}
|
13542
|
-
}
|
13543
15055
|
|
13544
|
-
|
13545
|
-
|
13546
|
-
|
15056
|
+
// copy logits
|
15057
|
+
{
|
15058
|
+
const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);
|
13547
15059
|
|
13548
|
-
|
15060
|
+
data_ctx->write(&logits_size, sizeof(logits_size));
|
13549
15061
|
|
13550
|
-
|
13551
|
-
|
15062
|
+
if (logits_size) {
|
15063
|
+
data_ctx->write(ctx->logits, logits_size * sizeof(float));
|
15064
|
+
}
|
15065
|
+
}
|
15066
|
+
|
15067
|
+
// copy embeddings
|
15068
|
+
{
|
15069
|
+
const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
|
15070
|
+
|
15071
|
+
data_ctx->write(&embeddings_size, sizeof(embeddings_size));
|
15072
|
+
|
15073
|
+
if (embeddings_size) {
|
15074
|
+
data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
|
15075
|
+
}
|
13552
15076
|
}
|
13553
15077
|
}
|
13554
15078
|
|
@@ -13561,9 +15085,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13561
15085
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
13562
15086
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
13563
15087
|
|
13564
|
-
|
15088
|
+
// NOTE: kv_size and kv_buf_size are mostly used for sanity checks
|
13565
15089
|
const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
|
13566
15090
|
const uint32_t kv_size = kv_self.size;
|
15091
|
+
const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
|
13567
15092
|
const uint32_t kv_used = kv_self.used;
|
13568
15093
|
|
13569
15094
|
data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
|
@@ -13572,6 +15097,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13572
15097
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
13573
15098
|
|
13574
15099
|
if (kv_buf_size) {
|
15100
|
+
const size_t pre_kv_buf_size = data_ctx->get_size_written();
|
15101
|
+
|
13575
15102
|
std::vector<uint8_t> tmp_buf;
|
13576
15103
|
for (int il = 0; il < (int) n_layer; ++il) {
|
13577
15104
|
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
@@ -13601,6 +15128,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13601
15128
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
13602
15129
|
}
|
13603
15130
|
}
|
15131
|
+
GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
|
13604
15132
|
}
|
13605
15133
|
|
13606
15134
|
for (uint32_t i = 0; i < kv_head; ++i) {
|
@@ -13645,6 +15173,28 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
13645
15173
|
GGML_ASSERT(!rng_ss.fail());
|
13646
15174
|
}
|
13647
15175
|
|
15176
|
+
// set output ids
|
15177
|
+
{
|
15178
|
+
size_t n_outputs;
|
15179
|
+
std::vector<int32_t> output_pos;
|
15180
|
+
|
15181
|
+
memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
|
15182
|
+
|
15183
|
+
GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
|
15184
|
+
|
15185
|
+
if (n_outputs) {
|
15186
|
+
output_pos.resize(n_outputs);
|
15187
|
+
memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
|
15188
|
+
inp += n_outputs * sizeof(int32_t);
|
15189
|
+
|
15190
|
+
for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
|
15191
|
+
int32_t id = output_pos[i];
|
15192
|
+
GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
|
15193
|
+
ctx->output_ids[id] = i;
|
15194
|
+
}
|
15195
|
+
}
|
15196
|
+
}
|
15197
|
+
|
13648
15198
|
// set logits
|
13649
15199
|
{
|
13650
15200
|
size_t logits_size;
|
@@ -13665,7 +15215,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
13665
15215
|
|
13666
15216
|
memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
|
13667
15217
|
|
13668
|
-
GGML_ASSERT(ctx->embd_size
|
15218
|
+
GGML_ASSERT(ctx->embd_size >= embeddings_size);
|
13669
15219
|
|
13670
15220
|
if (embeddings_size) {
|
13671
15221
|
memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
|
@@ -13692,8 +15242,18 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
13692
15242
|
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
13693
15243
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
13694
15244
|
|
15245
|
+
if (kv_self.size != kv_size) {
|
15246
|
+
// the KV cache needs to be big enough to load all the KV cells from the saved state
|
15247
|
+
GGML_ASSERT(kv_self.size >= kv_head);
|
15248
|
+
|
15249
|
+
LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
|
15250
|
+
__func__, kv_head, kv_size, kv_self.size);
|
15251
|
+
}
|
15252
|
+
|
13695
15253
|
if (kv_buf_size) {
|
13696
|
-
|
15254
|
+
const size_t pre_kv_buf_size = inp - src;
|
15255
|
+
|
15256
|
+
GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
|
13697
15257
|
|
13698
15258
|
for (int il = 0; il < (int) n_layer; ++il) {
|
13699
15259
|
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
@@ -13713,23 +15273,21 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
13713
15273
|
|
13714
15274
|
// v is not contiguous, copy row by row
|
13715
15275
|
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
13716
|
-
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type,
|
15276
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);
|
13717
15277
|
|
13718
15278
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
13719
15279
|
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
|
13720
15280
|
inp += v_row_size;
|
13721
15281
|
}
|
13722
15282
|
}
|
15283
|
+
GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
|
13723
15284
|
}
|
13724
15285
|
|
13725
|
-
|
15286
|
+
llama_kv_cache_clear(ctx);
|
13726
15287
|
|
13727
15288
|
ctx->kv_self.head = kv_head;
|
13728
|
-
ctx->kv_self.size = kv_size;
|
13729
15289
|
ctx->kv_self.used = kv_used;
|
13730
15290
|
|
13731
|
-
ctx->kv_self.cells.resize(kv_size);
|
13732
|
-
|
13733
15291
|
for (uint32_t i = 0; i < kv_head; ++i) {
|
13734
15292
|
llama_pos pos;
|
13735
15293
|
size_t seq_id_size;
|
@@ -13746,11 +15304,6 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
13746
15304
|
ctx->kv_self.cells[i].seq_id.insert(seq_id);
|
13747
15305
|
}
|
13748
15306
|
}
|
13749
|
-
|
13750
|
-
for (uint32_t i = kv_head; i < kv_size; ++i) {
|
13751
|
-
ctx->kv_self.cells[i].pos = -1;
|
13752
|
-
ctx->kv_self.cells[i].seq_id.clear();
|
13753
|
-
}
|
13754
15307
|
}
|
13755
15308
|
|
13756
15309
|
const size_t nread = inp - src;
|
@@ -13956,11 +15509,33 @@ float * llama_get_logits(struct llama_context * ctx) {
|
|
13956
15509
|
}
|
13957
15510
|
|
13958
15511
|
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
13959
|
-
assert(ctx->logits_valid.at(i));
|
13960
|
-
|
13961
15512
|
llama_synchronize(ctx);
|
13962
15513
|
|
13963
|
-
|
15514
|
+
try {
|
15515
|
+
if (ctx->logits == nullptr) {
|
15516
|
+
throw std::runtime_error("no logits");
|
15517
|
+
}
|
15518
|
+
if ((size_t) i >= ctx->output_ids.size()) {
|
15519
|
+
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
|
15520
|
+
}
|
15521
|
+
const int32_t j = ctx->output_ids[i];
|
15522
|
+
|
15523
|
+
if (j < 0) {
|
15524
|
+
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
15525
|
+
}
|
15526
|
+
if ((size_t) j >= ctx->output_size) {
|
15527
|
+
// This should not happen
|
15528
|
+
throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
|
15529
|
+
}
|
15530
|
+
|
15531
|
+
return ctx->logits + j*ctx->model.hparams.n_vocab;
|
15532
|
+
} catch (const std::exception & err) {
|
15533
|
+
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
|
15534
|
+
#ifndef NDEBUG
|
15535
|
+
GGML_ASSERT(false);
|
15536
|
+
#endif
|
15537
|
+
return nullptr;
|
15538
|
+
}
|
13964
15539
|
}
|
13965
15540
|
|
13966
15541
|
float * llama_get_embeddings(struct llama_context * ctx) {
|
@@ -13972,7 +15547,31 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
|
13972
15547
|
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
13973
15548
|
llama_synchronize(ctx);
|
13974
15549
|
|
13975
|
-
|
15550
|
+
try {
|
15551
|
+
if (ctx->embd == nullptr) {
|
15552
|
+
throw std::runtime_error("no embeddings");
|
15553
|
+
}
|
15554
|
+
if ((size_t) i >= ctx->output_ids.size()) {
|
15555
|
+
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
|
15556
|
+
}
|
15557
|
+
const int32_t j = ctx->output_ids[i];
|
15558
|
+
|
15559
|
+
if (j < 0) {
|
15560
|
+
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
15561
|
+
}
|
15562
|
+
if ((size_t) j >= ctx->output_size) {
|
15563
|
+
// This should not happen
|
15564
|
+
throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
|
15565
|
+
}
|
15566
|
+
|
15567
|
+
return ctx->embd + j*ctx->model.hparams.n_embd;
|
15568
|
+
} catch (const std::exception & err) {
|
15569
|
+
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
|
15570
|
+
#ifndef NDEBUG
|
15571
|
+
GGML_ASSERT(false);
|
15572
|
+
#endif
|
15573
|
+
return nullptr;
|
15574
|
+
}
|
13976
15575
|
}
|
13977
15576
|
|
13978
15577
|
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
|
@@ -14262,6 +15861,55 @@ static int32_t llama_chat_apply_template_internal(
|
|
14262
15861
|
ss << message->content << "</s>";
|
14263
15862
|
}
|
14264
15863
|
}
|
15864
|
+
} else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
|
15865
|
+
// openchat/openchat-3.5-0106,
|
15866
|
+
for (auto message : chat) {
|
15867
|
+
std::string role(message->role);
|
15868
|
+
if (role == "system") {
|
15869
|
+
ss << message->content << "<|end_of_turn|>";
|
15870
|
+
} else {
|
15871
|
+
role[0] = toupper(role[0]);
|
15872
|
+
ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
|
15873
|
+
}
|
15874
|
+
}
|
15875
|
+
if (add_ass) {
|
15876
|
+
ss << "GPT4 Correct Assistant:";
|
15877
|
+
}
|
15878
|
+
} else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
|
15879
|
+
// eachadea/vicuna-13b-1.1 (and Orca variant)
|
15880
|
+
for (auto message : chat) {
|
15881
|
+
std::string role(message->role);
|
15882
|
+
if (role == "system") {
|
15883
|
+
// Orca-Vicuna variant uses a system prefix
|
15884
|
+
if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
|
15885
|
+
ss << "SYSTEM: " << message->content << "\n";
|
15886
|
+
} else {
|
15887
|
+
ss << message->content << "\n\n";
|
15888
|
+
}
|
15889
|
+
} else if (role == "user") {
|
15890
|
+
ss << "USER: " << message->content << "\n";
|
15891
|
+
} else if (role == "assistant") {
|
15892
|
+
ss << "ASSISTANT: " << message->content << "</s>\n";
|
15893
|
+
}
|
15894
|
+
}
|
15895
|
+
if (add_ass) {
|
15896
|
+
ss << "ASSISTANT:";
|
15897
|
+
}
|
15898
|
+
} else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
|
15899
|
+
// deepseek-ai/deepseek-coder-33b-instruct
|
15900
|
+
for (auto message : chat) {
|
15901
|
+
std::string role(message->role);
|
15902
|
+
if (role == "system") {
|
15903
|
+
ss << message->content;
|
15904
|
+
} else if (role == "user") {
|
15905
|
+
ss << "### Instruction:\n" << message->content << "\n";
|
15906
|
+
} else if (role == "assistant") {
|
15907
|
+
ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
|
15908
|
+
}
|
15909
|
+
}
|
15910
|
+
if (add_ass) {
|
15911
|
+
ss << "### Response:\n";
|
15912
|
+
}
|
14265
15913
|
} else {
|
14266
15914
|
// template not supported
|
14267
15915
|
return -1;
|
@@ -14311,6 +15959,30 @@ LLAMA_API int32_t llama_chat_apply_template(
|
|
14311
15959
|
return res;
|
14312
15960
|
}
|
14313
15961
|
|
15962
|
+
// Builds the file name of one split of a sharded GGUF model:
//
//     <path_prefix>-<split_no + 1, 5 digits>-of-<split_count, 5 digits>.gguf
//
// The name is written to split_path, NUL-terminated and truncated to fit into
// maxlen bytes.
//
// Returns the number of characters stored in split_path (terminator excluded),
// or 0 when nothing could be written (null buffer, maxlen == 0, or a formatting error).
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";

    // without room for even the terminator snprintf writes nothing, so the buffer
    // must not be read back afterwards
    if (split_path == nullptr || maxlen == 0) {
        return 0;
    }

    const int n = snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count);
    if (n < 0) {
        // formatting failed: make sure the caller sees an empty string
        split_path[0] = '\0';
        return 0;
    }

    // snprintf reports the untruncated length; the buffer holds at most maxlen - 1 characters
    return (size_t) n < maxlen ? n : (int) (maxlen - 1);
}
|
15969
|
+
|
15970
|
+
// Recovers the path prefix from the file name of one split of a sharded GGUF model.
//
// If split_path ends with "-<split_no + 1, 5 digits>-of-<split_count, 5 digits>.gguf"
// and at least one character precedes that postfix, the preceding part is copied to
// dest (NUL-terminated, truncated to fit into maxlen bytes) and its full length is
// returned - which may be larger than what fit into dest.
//
// Returns 0, leaving dest untouched, when split_path is null or does not match.
int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
    if (split_path == nullptr) {
        return 0;
    }

    const std::string str_split_path(split_path);

    char postfix[32];
    snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);
    const std::string str_postfix(postfix);

    // the path has to be strictly longer than the postfix; testing this first keeps
    // the subtraction below from wrapping around
    if (str_split_path.size() <= str_postfix.size()) {
        return 0;
    }
    const size_t size_prefix = str_split_path.size() - str_postfix.size();

    // check if split_path ends with postfix
    if (str_split_path.compare(size_prefix, str_postfix.size(), str_postfix) != 0) {
        return 0;
    }

    // copy at most size_prefix characters plus the terminator
    if (dest != nullptr && maxlen > 0) {
        snprintf(dest, std::min(size_prefix + 1, maxlen), "%s", split_path);
    }

    return (int) size_prefix;
}
|
15985
|
+
|
14314
15986
|
struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
14315
15987
|
struct llama_timings result = {
|
14316
15988
|
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
|