llama_cpp 0.14.2 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
@@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#ifdef
+#ifdef GGML_USE_CUDA
 # include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 # include "ggml-opencl.h"
@@ -52,12 +52,16 @@
 #define NOMINMAX
 #endif
 #include <windows.h>
+#ifndef PATH_MAX
+#define PATH_MAX MAX_PATH
+#endif
 #include <io.h>
 #endif
 
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cctype>
 #include <cfloat>
 #include <cinttypes>
 #include <climits>
@@ -68,7 +72,6 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
@@ -192,6 +195,7 @@ enum llm_arch {
 LLM_ARCH_LLAMA,
 LLM_ARCH_FALCON,
 LLM_ARCH_BAICHUAN,
+LLM_ARCH_GROK,
 LLM_ARCH_GPT2,
 LLM_ARCH_GPTJ,
 LLM_ARCH_GPTNEOX,
@@ -214,12 +218,15 @@ enum llm_arch {
 LLM_ARCH_GEMMA,
 LLM_ARCH_STARCODER2,
 LLM_ARCH_MAMBA,
+LLM_ARCH_XVERSE,
+LLM_ARCH_COMMAND_R,
 LLM_ARCH_UNKNOWN,
 };
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_LLAMA, "llama" },
 { LLM_ARCH_FALCON, "falcon" },
+{ LLM_ARCH_GROK, "grok" },
 { LLM_ARCH_GPT2, "gpt2" },
 { LLM_ARCH_GPTJ, "gptj" },
 { LLM_ARCH_GPTNEOX, "gptneox" },
@@ -243,6 +250,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_GEMMA, "gemma" },
 { LLM_ARCH_STARCODER2, "starcoder2" },
 { LLM_ARCH_MAMBA, "mamba" },
+{ LLM_ARCH_XVERSE, "xverse" },
+{ LLM_ARCH_COMMAND_R, "command-r" },
 { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -268,6 +277,7 @@ enum llm_kv {
 LLM_KV_EXPERT_COUNT,
 LLM_KV_EXPERT_USED_COUNT,
 LLM_KV_POOLING_TYPE,
+LLM_KV_LOGIT_SCALE,
 
 LLM_KV_ATTENTION_HEAD_COUNT,
 LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -287,6 +297,10 @@ enum llm_kv {
 LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
 LLM_KV_ROPE_SCALING_FINETUNED,
 
+LLM_KV_SPLIT_NO,
+LLM_KV_SPLIT_COUNT,
+LLM_KV_SPLIT_TENSORS_COUNT,
+
 LLM_KV_SSM_INNER_SIZE,
 LLM_KV_SSM_CONV_KERNEL,
 LLM_KV_SSM_STATE_SIZE,
@@ -332,6 +346,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
 { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
 { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
 
 { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
 { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -351,6 +366,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
 { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
 
+{ LLM_KV_SPLIT_NO, "split.no" },
+{ LLM_KV_SPLIT_COUNT, "split.count" },
+{ LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
+
 { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
 { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
 { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
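The three new split.* keys carry the sharding metadata for models distributed as multiple GGUF files. As a rough illustration (not taken from this diff), a loader could read them through the public gguf API; gguf_find_key and gguf_get_val_u16 are existing ggml functions, while the surrounding helper is hypothetical:

    // Hypothetical helper: report which shard of a split model a GGUF file is.
    #include <cstdio>
    #include "ggml.h" // gguf_* API

    static void print_split_info(const struct gguf_context * ctx) {
        const int id_no    = gguf_find_key(ctx, "split.no");
        const int id_count = gguf_find_key(ctx, "split.count");
        if (id_no < 0 || id_count < 0) {
            printf("single-file model\n");
            return;
        }
        // split.no is zero-based, split.count is the total number of shards
        printf("shard %d of %d\n", (int) gguf_get_val_u16(ctx, id_no) + 1, (int) gguf_get_val_u16(ctx, id_count));
    }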
@@ -407,9 +426,12 @@ enum llm_tensor {
 LLM_TENSOR_FFN_DOWN,
 LLM_TENSOR_FFN_UP,
 LLM_TENSOR_FFN_ACT,
-LLM_TENSOR_FFN_DOWN_EXP,
+LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
 LLM_TENSOR_FFN_GATE_EXP,
 LLM_TENSOR_FFN_UP_EXP,
+LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
+LLM_TENSOR_FFN_GATE_EXPS,
+LLM_TENSOR_FFN_UP_EXPS,
 LLM_TENSOR_ATTN_Q_NORM,
 LLM_TENSOR_ATTN_K_NORM,
 LLM_TENSOR_LAYER_OUT_NORM,
@@ -444,6 +466,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
 { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
 { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
 },
 },
 {
@@ -479,6 +504,31 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_GROK,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+},
+},
 {
 LLM_ARCH_GPT2,
 {
@@ -536,6 +586,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 {
 { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
 { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output"},
 { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
 { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
 { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -543,6 +594,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
+{ LLM_TENSOR_POS_EMBD, "position_embd" },
+{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
 },
 },
 {
@@ -838,6 +892,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
 },
 },
+{
+LLM_ARCH_XVERSE,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
+{
+LLM_ARCH_COMMAND_R,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
 {
 LLM_ARCH_UNKNOWN,
 {
@@ -1010,7 +1098,7 @@ struct llama_file {
 size_t size;
 
 llama_file(const char * fname, const char * mode) {
-fp =
+fp = ggml_fopen(fname, mode);
 if (fp == NULL) {
 throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
 }
@@ -1079,6 +1167,7 @@ struct llama_file {
 }
 }
 };
+using llama_files = std::vector<std::unique_ptr<llama_file>>;
 
 struct llama_mmap {
 void * addr;
@@ -1279,6 +1368,7 @@ struct llama_mmap {
 }
 #endif
 };
+using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
 
 // Represents some region of memory being locked using mlock or VirtualLock;
 // will automatically unlock on destruction.
@@ -1428,6 +1518,7 @@ struct llama_mlock {
 static void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
+using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
 static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
 std::vector<char> result(8, 0);
@@ -1447,7 +1538,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
 ggml_backend_buffer_type_t buft = nullptr;
 
-#if defined(
+#if defined(GGML_USE_CUDA)
 // host buffers should only be used when data is expected to be copied to/from the GPU
 if (host_buffer) {
 buft = ggml_backend_cuda_host_buffer_type();
@@ -1477,7 +1568,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 
 #ifdef GGML_USE_METAL
 buft = ggml_backend_metal_buffer_type();
-#elif defined(
+#elif defined(GGML_USE_CUDA)
 buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
 buft = ggml_backend_vk_buffer_type(gpu);
@@ -1503,7 +1594,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
 ggml_backend_buffer_type_t buft = nullptr;
 
-#ifdef
+#ifdef GGML_USE_CUDA
 if (ggml_backend_cuda_get_device_count() > 1) {
 buft = ggml_backend_cuda_split_buffer_type(tensor_split);
 }
@@ -1524,7 +1615,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 }
 
 static size_t llama_get_device_count() {
-#if defined(
+#if defined(GGML_USE_CUDA)
 return ggml_backend_cuda_get_device_count();
 #elif defined(GGML_USE_SYCL)
 return ggml_backend_sycl_get_device_count();
@@ -1536,7 +1627,7 @@
 }
 
 static size_t llama_get_device_memory(int device) {
-#if defined(
+#if defined(GGML_USE_CUDA)
 size_t total;
 size_t free;
 ggml_backend_cuda_get_device_memory(device, &total, &free);
@@ -1597,9 +1688,11 @@ enum e_model {
 MODEL_20B,
 MODEL_30B,
 MODEL_34B,
+MODEL_35B,
 MODEL_40B,
 MODEL_65B,
 MODEL_70B,
+MODEL_314B,
 MODEL_SMALL,
 MODEL_MEDIUM,
 MODEL_LARGE,
@@ -1643,6 +1736,7 @@ struct llama_hparams {
 
 float f_clamp_kqv = 0.0f;
 float f_max_alibi_bias = 0.0f;
+float f_logit_scale = 0.0f;
 
 bool causal_attn = true;
 bool need_kq_pos = false;
@@ -1716,6 +1810,7 @@ struct llama_cparams {
 uint32_t n_ctx; // context size used during inference
 uint32_t n_batch;
 uint32_t n_ubatch;
+uint32_t n_seq_max;
 uint32_t n_threads; // number of threads to use for generation
 uint32_t n_threads_batch; // number of threads to use for batch processing
 
@@ -1781,9 +1876,9 @@ struct llama_layer {
 
 // ff MoE
 struct ggml_tensor * ffn_gate_inp;
-struct ggml_tensor *
-struct ggml_tensor *
-struct ggml_tensor *
+struct ggml_tensor * ffn_gate_exps;
+struct ggml_tensor * ffn_down_exps;
+struct ggml_tensor * ffn_up_exps ;
 
 // ff bias
 struct ggml_tensor * ffn_down_b; // b2
@@ -1873,6 +1968,31 @@ struct llama_kv_cache {
 }
 };
 
+struct llama_control_vector {
+std::vector<struct ggml_tensor *> tensors; // per layer
+std::vector<struct ggml_context *> ctxs;
+std::vector<ggml_backend_buffer_t> bufs;
+
+int32_t layer_start = -1;
+int32_t layer_end = -1;
+
+ggml_tensor * tensor_for(int il) const {
+if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+return nullptr;
+}
+return tensors[il];
+}
+
+~llama_control_vector() {
+for (struct ggml_context * ctx : ctxs) {
+ggml_free(ctx);
+}
+for (ggml_backend_buffer_t buf : bufs) {
+ggml_backend_buffer_free(buf);
+}
+}
+};
+
 struct llama_vocab {
 using id = int32_t;
 using token = std::string;
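The new llama_control_vector holds one steering tensor per layer plus an active layer range. A hedged sketch of how tensor_for() is consumed when a layer's hidden state is built; the apply_control_vector helper and the ctx0/cur names are illustrative, not taken from this diff:

    // Illustrative only: add the control vector's per-layer direction onto the
    // hidden state of layer il, skipping layers outside [layer_start, layer_end].
    static struct ggml_tensor * apply_control_vector(
            struct ggml_context * ctx0,
            const llama_control_vector & cvec,
            struct ggml_tensor * cur, int il) {
        struct ggml_tensor * layer_dir = cvec.tensor_for(il);
        if (layer_dir != nullptr) {
            cur = ggml_add(ctx0, cur, layer_dir);
        }
        return cur;
    }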
@@ -1976,12 +2096,12 @@ struct llama_model {
 // the model memory buffers for the tensor data
 std::vector<ggml_backend_buffer_t> bufs;
 
-// model memory mapped
-
+// model memory mapped files
+llama_mmaps mappings;
 
 // objects representing data potentially being locked in memory
-
-
+llama_mlocks mlock_bufs;
+llama_mlocks mlock_mmaps;
 
 // for quantize-stats only
 std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -1994,6 +2114,11 @@ struct llama_model {
 ggml_free(ctx);
 }
 for (ggml_backend_buffer_t buf : bufs) {
+#ifdef GGML_USE_CUDA
+if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
+ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
+}
+#endif
 ggml_backend_buffer_free(buf);
 }
 }
@@ -2008,10 +2133,6 @@ struct llama_context {
 ggml_backend_free(backend);
 }
 
-#ifdef GGML_USE_VULKAN
-ggml_vk_free_cpu_assist();
-#endif
-
 ggml_backend_buffer_free(buf_output);
 }
 
@@ -2048,20 +2169,20 @@ struct llama_context {
 // host buffer for the model output (logits and embeddings)
 ggml_backend_buffer_t buf_output = nullptr;
 
-// decode output (2-dimensional array: [
-size_t
-float * logits
+// decode output (2-dimensional array: [n_outputs][n_vocab])
+size_t logits_size = 0; // capacity (of floats) for logits
+float * logits = nullptr;
+
+std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
+size_t output_size = 0; // capacity (of tokens positions) for the output buffers
+int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
 
-#ifndef NDEBUG
-// guard against access to unset logits
-std::vector<bool> logits_valid;
-#endif
 bool logits_all = false;
 
-// embeddings output (2-dimensional array: [
+// embeddings output (2-dimensional array: [n_outputs][n_embd])
 // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
-size_t
-float * embd
+size_t embd_size = 0; // capacity (of floats) for embeddings
+float * embd = nullptr;
 
 // sequence embeddings output (map of [n_embd] vectors)
 // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
@@ -2078,14 +2199,18 @@ struct llama_context {
 struct ggml_tensor * inp_tokens; // I32 [n_batch]
 struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
 struct ggml_tensor * inp_pos; // I32 [n_batch]
+struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
 struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
-struct ggml_tensor * inp_KQ_pos; // F32 [
+struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
 struct ggml_tensor * inp_K_shift; // I32 [kv_size]
 struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
 struct ggml_tensor * inp_cls; // I32 [n_batch]
 struct ggml_tensor * inp_s_copy; // I32 [kv_size]
-struct ggml_tensor * inp_s_mask; // F32 [1,
-struct ggml_tensor * inp_s_seq; // I32 [
+struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
+struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
+
+// control vectors
+struct llama_control_vector cvec;
 
 #ifdef GGML_USE_MPI
 ggml_mpi_context * ctx_mpi = NULL;
@@ -2737,6 +2862,8 @@ namespace GGUFMeta {
 };
 }
 
+using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+
 struct llama_model_loader {
 int n_kv = 0;
 int n_tensors = 0;
@@ -2747,54 +2874,133 @@ struct llama_model_loader {
 
 bool use_mmap = false;
 
-
+llama_files files;
 llama_ftype ftype;
 llama_fver fver;
 
-
+llama_mmaps mappings;
+
+// Holds information on a model weight
+struct llama_tensor_weight {
+uint16_t idx; // source file index
+size_t offs; // tensor data offset in the original file
+
+ggml_tensor * tensor;
+
+llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+}
+};
+std::vector<llama_tensor_weight> weights;
+
 std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
-struct gguf_context *
-
+struct gguf_context * meta = NULL;
+std::vector<ggml_context *> contexts;
 
 std::string arch_name;
 LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
 
-llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p)
+llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
 int trace = 0;
 if (getenv("LLAMA_TRACE")) {
 trace = atoi(getenv("LLAMA_TRACE"));
 }
 
-struct gguf_init_params params = {
-/*.no_alloc = */ true,
-/*.ctx = */ &ctx_meta,
-};
-
 if (param_overrides_p != nullptr) {
 for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
 kv_overrides.insert({std::string(p->key), *p});
 }
 }
 
-
-
+struct ggml_context * ctx = NULL;
+struct gguf_init_params params = {
+/*.no_alloc = */ true,
+/*.ctx = */ &ctx,
+};
+
+meta = gguf_init_from_file(fname.c_str(), params);
+if (!meta) {
 throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
 }
 
 get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
 llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
-
-
+// Save tensors data offset of the main file.
+// For subsidiary files, `meta` tensor data offset must not be used,
+// so we build a unified tensors index for weights.
+for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+weights.emplace_back(0, cur->name, meta, cur);
+}
+files.emplace_back(new llama_file(fname.c_str(), "rb"));
+contexts.emplace_back(ctx);
+
+uint16_t n_split = 0;
+get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+
+// Load additional GGML contexts
+if (n_split > 1) {
+uint16_t idx = 0;
+get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
+if (idx != 0) {
+throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
+}
+
+char split_prefix[PATH_MAX] = {0};
+if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
+throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
+}
+
+if (trace > 0) {
+LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
+}
+
+char split_path[PATH_MAX] = {0};
+for (idx = 1; idx < n_split; idx++) {
+llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+struct gguf_init_params split_params = {
+/*.no_alloc = */ true,
+/*.ctx = */ &ctx,
+};
+struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
+if (!ctx_gguf) {
+throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
+}
+
+// Save tensors data offset info of the shard.
+for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+}
+files.emplace_back(new llama_file(split_path, "rb"));
+contexts.emplace_back(ctx);
+
+gguf_free(ctx_gguf);
+}
+
+get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+
+// sanity check
+{
+const int n_tensors_loaded = (int) weights.size();
+if (n_tensors != n_tensors_loaded) {
+throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+}
+}
 
-
+LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+}
+
+n_kv = gguf_get_n_kv(meta);
+n_tensors = weights.size();
 
-
-
-
-n_elements += ggml_nelements(
-n_bytes += ggml_nbytes(
+fver = (enum llama_fver) gguf_get_version(meta);
+
+for (auto & w : weights) {
+n_elements += ggml_nelements(w.tensor);
+n_bytes += ggml_nbytes(w.tensor);
 }
 
 LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
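The constructor above resolves the remaining shards from the first file's name via llama_split_prefix() and llama_split_path(), both added to llama.h in this release. A small, hedged sketch of the naming convention; the prefix and shard count are made up, and the zero-padded -%05d-of-%05d.gguf suffix is an assumption about the upstream convention:

    // Hypothetical usage of the new split-path helpers declared in llama.h.
    #include <cstdio>
    #include "llama.h"

    int main() {
        char split_path[1024];
        const int n_split = 3; // pretend the model ships as three shards
        for (int i = 0; i < n_split; ++i) {
            llama_split_path(split_path, sizeof(split_path), "/models/grok-1-Q4_K", i, n_split);
            printf("%s\n", split_path); // e.g. /models/grok-1-Q4_K-00001-of-00003.gguf
        }
        return 0;
    }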
@@ -2809,7 +3015,8 @@ struct llama_model_loader {
 enum ggml_type type_max = GGML_TYPE_F32;
 
 for (int i = 0; i < n_tensors; i++) {
-
+const ggml_tensor * tensor = weights.at(i).tensor;
+enum ggml_type type = tensor->type;
 
 n_type[type]++;
 
@@ -2819,8 +3026,8 @@ struct llama_model_loader {
 }
 
 if (trace > 0) {
-
-LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(
+const uint16_t sid = weights.at(i).idx;
+LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
 }
 }
 
@@ -2842,6 +3049,7 @@ struct llama_model_loader {
 case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
 case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
 case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
 case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
 case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
 case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
@@ -2856,22 +3064,23 @@ struct llama_model_loader {
 ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
 
 {
-const int kid = gguf_find_key(
+const int kid = gguf_find_key(meta, "general.file_type");
 if (kid >= 0) {
-ftype = (llama_ftype) gguf_get_val_u32(
+ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
 }
 }
 
 LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+
 for (int i = 0; i < n_kv; i++) {
-const char * name = gguf_get_key(
-const enum gguf_type type = gguf_get_kv_type(
+const char * name = gguf_get_key(meta, i);
+const enum gguf_type type = gguf_get_kv_type(meta, i);
 const std::string type_name =
 type == GGUF_TYPE_ARRAY
-? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(
+? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
 : gguf_type_name(type);
 
-std::string value = gguf_kv_to_str(
+std::string value = gguf_kv_to_str(meta, i);
 const size_t MAX_VALUE_LEN = 40;
 if (value.size() > MAX_VALUE_LEN) {
 value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -2900,18 +3109,18 @@ struct llama_model_loader {
 }
 
 ~llama_model_loader() {
-if (
-gguf_free(
+if (meta) {
+gguf_free(meta);
 }
-
-ggml_free(
+for (auto * ctx : contexts) {
+ggml_free(ctx);
 }
 }
 
 template<typename T>
 typename std::enable_if<std::is_integral<T>::value, bool>::type
 get_arr_n(const std::string & key, T & result, const bool required = true) {
-const int kid = gguf_find_key(
+const int kid = gguf_find_key(meta, key.c_str());
 
 if (kid < 0) {
 if (required) {
@@ -2921,7 +3130,7 @@ struct llama_model_loader {
 }
 
 struct GGUFMeta::ArrayInfo arr_info =
-GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(
+GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
 
 
 result = arr_info.length;
@@ -2941,7 +3150,7 @@ struct llama_model_loader {
 const struct llama_model_kv_override * override =
 it != kv_overrides.end() ? &it->second : nullptr;
 
-const bool found = GGUFMeta::GKV<T>::set(
+const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
 
 if (required && !found) {
 throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -2964,28 +3173,57 @@ struct llama_model_loader {
 }
 
 const char * get_tensor_name(int i) const {
-return
+return weights.at(i).tensor->name;
+}
+
+const llama_tensor_weight * get_weight(const char * name) const {
+for (const auto & weight : weights) {
+if (strcmp(name, weight.tensor->name) == 0) {
+return &weight;
+}
+}
+return nullptr;
+}
+
+const llama_tensor_weight & require_weight(const char * name) const {
+const llama_tensor_weight * weight = get_weight(name);
+if (!weight) {
+throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+}
+return *weight;
 }
 
 struct ggml_tensor * get_tensor_meta(const char * name) const {
-
+const auto * weight = get_weight(name);
+if (!weight) {
+return nullptr;
+}
+return weight->tensor;
+}
+
+struct ggml_tensor * require_tensor_meta(const char * name) const {
+struct ggml_tensor * tensor = get_tensor_meta(name);
+if (!tensor) {
+throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+}
+return tensor;
 }
 
 struct ggml_tensor * get_tensor_meta(int i) const {
 return get_tensor_meta(get_tensor_name(i));
 }
 
-struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor *
-struct ggml_tensor * tensor = ggml_dup_tensor(ctx,
-ggml_set_name(tensor, ggml_get_name(
+struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
+ggml_set_name(tensor, ggml_get_name(cur));
 
 n_created++;
 
 return tensor;
 }
 
-struct ggml_tensor *
-struct ggml_tensor * cur =
+const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
+const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
 
 if (cur == NULL) {
 if (!required) {
@@ -2996,8 +3234,8 @@ struct llama_model_loader {
 
 {
 bool is_ok = true;
-for (size_t i = 0; i <
-if (ne[i] != cur->ne[i]) {
+for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
 is_ok = false;
 break;
 }
@@ -3011,127 +3249,196 @@ struct llama_model_loader {
 }
 }
 
-return
+return cur;
 }
 
-
-
-
+struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
+const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+
+if (cur == NULL) {
+return NULL;
 }
+
+return create_tensor_for(ctx, cur);
 }
 
-
-const
+struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
+const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
 
-if (
-
+if (cur == NULL) {
+return NULL;
 }
 
-
-
+if (cur->type != base->type) {
+throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
+}
 
-
-
-
-mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+std::array<int64_t, GGML_MAX_DIMS> dims;
+for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+dims[i] = i < ne.size() ? ne[i] : 1;
 }
 
-
-
-
-
+struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
+dims[0], dims[1], dims[2], dims[3],
+cur->nb[1], cur->nb[2], cur->nb[3],
+offset);
+
+ggml_set_name(tensor, name.c_str());
+
+n_created++;
+
+return tensor;
+}
+
+void done_getting_tensors() const {
+if (n_created != n_tensors) {
+throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
 }
+}
 
-
-
-
+void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
+if (use_mmap) {
+mappings.reserve(files.size());
+mmaps_used.reserve(files.size());
+for (const auto & file : files) {
+std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+mmaps_used.emplace_back(mapping->size, 0);
+if (mlock_mmaps) {
+std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
+mlock_mmap->init(mapping->addr);
+mlock_mmaps->emplace_back(std::move(mlock_mmap));
+}
+mappings.emplace_back(std::move(mapping));
 }
-
+}
+
+// compute the total size of all tensors for progress reporting
+for (auto & w : weights) {
+size_data += ggml_nbytes(w.tensor);
 }
 }
 
-void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
-GGML_ASSERT(
+void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+GGML_ASSERT(!mappings.empty());
+const auto & mapping = mappings.at(idx);
 
 *first = mapping->size;
 *last = 0;
+*addr = mapping->addr;
 for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-
-
-
+try {
+const auto * weight = get_weight(ggml_get_name(tensor));
+if (!weight) {
+continue;
+}
+if (weight->idx != idx) {
+continue;
+}
+*first = std::min(*first, weight->offs);
+*last = std::max(*last, weight->offs + ggml_nbytes(tensor));
+} catch(...) {
+// the tensor is not in the model
+}
 }
 }
 
 // for backwards compatibility, does not support ggml-backend
 void load_data_for(struct ggml_tensor * cur) const {
-const
+const auto & w = require_weight(ggml_get_name(cur));
 
-if (use_mmap
+if (use_mmap) {
+const auto & mapping = mappings.at(w.idx);
 if (cur->data == nullptr) {
-cur->data = (uint8_t *)mapping->addr + offs;
+cur->data = (uint8_t *)mapping->addr + w.offs;
 } else {
-memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
+memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
 }
 } else {
 GGML_ASSERT(cur->data != nullptr);
-
-file.
+GGML_ASSERT(w.idx < files.size());
+const auto & file = files.at(w.idx);
+file->seek(w.offs, SEEK_SET);
+file->read_raw(cur->data, ggml_nbytes(cur));
 }
 }
 
 size_t size_done = 0;
 size_t size_data = 0;
-size_t
-size_t mmap_used_last = 0;
+std::vector<std::pair<size_t, size_t>> mmaps_used;
 
 // Returns false if cancelled by progress_callback
-bool load_all_data(
-
+bool load_all_data(
+struct ggml_context * ctx,
+llama_buf_map & bufs_mmap,
+llama_mlocks * lmlocks,
+llama_progress_callback progress_callback,
+void * progress_callback_user_data) {
+GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
 std::vector<no_init<uint8_t>> read_buf;
-
 for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+const auto * weight = get_weight(ggml_get_name(cur));
+if (weight == nullptr) {
+// this can happen with split experts models
+continue;
+}
+
 if (progress_callback) {
 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
 return false;
 }
 }
 
-
+size_t n_size = ggml_nbytes(cur);
 
-if (use_mmap
+if (use_mmap) {
+const auto & mapping = mappings.at(weight->idx);
+ggml_backend_buffer_t buf_mmap = nullptr;
+if (bufs_mmap.count(weight->idx)) {
+buf_mmap = bufs_mmap.at(weight->idx);
+}
+GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
 if (buf_mmap && cur->data == nullptr) {
-ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
-if (
-lmlock
+ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
+if (lmlocks) {
+const auto & lmlock = lmlocks->at(weight->idx);
+lmlock->grow_to(weight->offs + ggml_nbytes(cur));
 }
-
-
+
+auto & mmap_used = mmaps_used[weight->idx];
+mmap_used.first = std::min(mmap_used.first, weight->offs);
+mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
 } else {
-ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0,
+ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
 }
 } else {
+GGML_ASSERT(weight->idx < files.size());
+const auto & file = files.at(weight->idx);
 if (ggml_backend_buffer_is_host(cur->buffer)) {
-file
-file
+file->seek(weight->offs, SEEK_SET);
+file->read_raw(cur->data, ggml_nbytes(cur));
 } else {
 read_buf.resize(ggml_nbytes(cur));
-file
-file
-ggml_backend_tensor_set(cur, read_buf.data(), 0,
+file->seek(weight->offs, SEEK_SET);
+file->read_raw(read_buf.data(), ggml_nbytes(cur));
+ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
 }
 }
 
-size_done +=
+size_done += n_size;
 }
 
 // check if this is the last call and do final cleanup
 if (size_done >= size_data) {
 // unmap offloaded tensors and metadata
-if (use_mmap
-
-
-mapping
+if (use_mmap) {
+for (uint32_t idx = 0; idx < mappings.size(); idx++) {
+const auto & mmap_used = mmaps_used.at(idx);
+auto & mapping = mappings.at(idx);
+mapping->unmap_fragment(0, mmap_used.first);
+if (mmap_used.second != 0) {
+mapping->unmap_fragment(mmap_used.second, mapping->size);
+}
 }
 }
 if (progress_callback) {
@@ -3204,6 +3511,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
+case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
@@ -3231,9 +3539,11 @@ static const char * llama_model_type_name(e_model type) {
 case MODEL_20B: return "20B";
 case MODEL_30B: return "30B";
 case MODEL_34B: return "34B";
+case MODEL_35B: return "35B";
 case MODEL_40B: return "40B";
 case MODEL_65B: return "65B";
 case MODEL_70B: return "70B";
+case MODEL_314B: return "314B";
 case MODEL_SMALL: return "0.1B";
 case MODEL_MEDIUM: return "0.4B";
 case MODEL_LARGE: return "0.8B";
@@ -3263,7 +3573,7 @@ static void llm_load_hparams(
 llama_model_loader & ml,
 llama_model & model) {
 auto & hparams = model.hparams;
-const gguf_context * ctx = ml.
+const gguf_context * ctx = ml.meta;
 
 // get metadata as string
 for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3372,6 +3682,15 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_GROK:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+switch (hparams.n_layer) {
+case 64: model.type = e_model::MODEL_314B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 case LLM_ARCH_FALCON:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3623,6 +3942,25 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_XVERSE:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+switch (hparams.n_layer) {
+case 32: model.type = e_model::MODEL_7B; break;
+case 40: model.type = e_model::MODEL_13B; break;
+case 80: model.type = e_model::MODEL_65B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
+case LLM_ARCH_COMMAND_R:
+{
+ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+switch (hparams.n_layer) {
+case 40: model.type = e_model::MODEL_35B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 default: (void)0;
 }
 
@@ -3644,7 +3982,7 @@ static void llm_load_vocab(
 llama_model & model) {
 auto & vocab = model.vocab;
 
-struct gguf_context * ctx = ml.
+struct gguf_context * ctx = ml.meta;
 
 const auto kv = LLM_KV(model.arch);
 
@@ -3777,7 +4115,7 @@ static void llm_load_vocab(
 } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
 vocab.linefeed_id = vocab.special_pad_id;
 } else {
-const std::vector<int> ids = llama_tokenize_internal(vocab, "\
+const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
 GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
 vocab.linefeed_id = ids[0];
 }
@@ -3944,6 +4282,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
 LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
 LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
+LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
 LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
 LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
 LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
@@ -4009,6 +4348,7 @@ static bool llm_load_tensors(
|
|
4009
4348
|
|
4010
4349
|
const int64_t n_layer = hparams.n_layer;
|
4011
4350
|
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
|
4351
|
+
bool use_mmap_buffer = true;
|
4012
4352
|
|
4013
4353
|
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
4014
4354
|
model.buft_input = llama_default_buffer_type_cpu(true);
|
@@ -4097,6 +4437,10 @@ static bool llm_load_tensors(
|
|
4097
4437
|
|
4098
4438
|
// create one context per buffer type
|
4099
4439
|
size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
4440
|
+
|
4441
|
+
// for moe merged tensors
|
4442
|
+
ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
|
4443
|
+
|
4100
4444
|
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
4101
4445
|
for (auto & it : buft_layer_count) {
|
4102
4446
|
struct ggml_init_params params = {
|
@@ -4123,6 +4467,11 @@ static bool llm_load_tensors(
|
|
4123
4467
|
const int64_t n_vocab = hparams.n_vocab;
|
4124
4468
|
const int64_t n_vocab_type = hparams.n_vocab_type;
|
4125
4469
|
const int64_t n_ff = hparams.n_ff;
|
4470
|
+
const int64_t n_expert = hparams.n_expert;
|
4471
|
+
|
4472
|
+
if (n_expert > 0 && hparams.n_expert_used == 0) {
|
4473
|
+
throw std::runtime_error("model has expert layers but no expert layers are used");
|
4474
|
+
}
|
4126
4475
|
|
4127
4476
|
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
4128
4477
|
|
@@ -4177,26 +4526,113 @@ static bool llm_load_tensors(
|
|
4177
4526
|
|
4178
4527
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4179
4528
|
|
4180
|
-
|
4181
|
-
|
4182
|
-
if (layer.ffn_gate_inp == nullptr) {
|
4183
|
-
GGML_ASSERT(hparams.n_expert == 0);
|
4184
|
-
GGML_ASSERT(hparams.n_expert_used == 0);
|
4185
|
-
|
4529
|
+
if (n_expert == 0) {
|
4186
4530
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
4187
4531
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
4188
4532
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4189
4533
|
} else {
|
4190
|
-
|
4191
|
-
|
4192
|
-
|
4193
|
-
|
4194
|
-
|
4195
|
-
layer.
|
4196
|
-
|
4197
|
-
|
4534
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
4535
|
+
|
4536
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
4537
|
+
if (layer.ffn_gate_exps) {
|
4538
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
4539
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
4540
|
+
} else {
|
4541
|
+
// merge split expert into a single tensor for compatibility with older models
|
4542
|
+
// requires disabling mmap
|
4543
|
+
use_mmap_buffer = false;
|
4544
|
+
|
4545
|
+
ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
|
4546
|
+
ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
|
4547
|
+
ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
|
4548
|
+
|
4549
|
+
layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
|
4550
|
+
layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
|
4551
|
+
layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
|
4552
|
+
|
4553
|
+
ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
|
4554
|
+
ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
|
4555
|
+
ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
|
4556
|
+
|
4557
|
+
for (uint32_t x = 0; x < n_expert; ++x) {
|
4558
|
+
// the individual experts are loaded into a view of the merged tensor
|
4559
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
|
4560
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
|
4561
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
|
4562
|
+
}
|
4563
|
+
}
|
4564
|
+
}
|
4565
|
+
}
|
4566
|
+
} break;
|
4567
|
+
case LLM_ARCH_GROK:
|
4568
|
+
{
|
4569
|
+
if (n_expert == 0) {
|
4570
|
+
throw std::runtime_error("Grok model cannot have zero experts");
|
4571
|
+
}
|
4572
|
+
|
4573
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4574
|
+
|
4575
|
+
// output
|
4576
|
+
{
|
4577
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4578
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
4579
|
+
// if output is NULL, init from the input tok embed
|
4580
|
+
if (model.output == NULL) {
|
4581
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4582
|
+
ml.n_created--; // artificial tensor
|
4583
|
+
ml.size_data += ggml_nbytes(model.output);
|
4584
|
+
}
|
4585
|
+
}
|
4586
|
+
|
4587
|
+
for (int i = 0; i < n_layer; ++i) {
|
4588
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4589
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4590
|
+
|
4591
|
+
auto & layer = model.layers[i];
|
4592
|
+
|
4593
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4594
|
+
|
4595
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
4596
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
4597
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
4598
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4599
|
+
|
4600
|
+
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
|
4601
|
+
|
4602
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4603
|
+
|
4604
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
4605
|
+
|
4606
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
4607
|
+
if (layer.ffn_gate_exps) {
|
4608
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
4609
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
4610
|
+
} else {
|
4611
|
+
// merge split expert into a single tensor for compatibility with older models
|
4612
|
+
// requires disabling mmap
|
4613
|
+
use_mmap_buffer = false;
|
4614
|
+
|
4615
|
+
ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
|
4616
|
+
ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
|
4617
|
+
ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
|
4618
|
+
|
4619
|
+
layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
|
4620
|
+
layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
|
4621
|
+
layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
|
4622
|
+
|
4623
|
+
ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
|
4624
|
+
ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
|
4625
|
+
ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
|
4626
|
+
|
4627
|
+
for (uint32_t x = 0; x < n_expert; ++x) {
|
4628
|
+
// the individual experts are loaded into a view of the merged tensor
|
4629
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
|
4630
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
|
4631
|
+
ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
|
4198
4632
|
}
|
4199
4633
|
}
|
4634
|
+
|
4635
|
+
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
4200
4636
|
}
|
4201
4637
|
} break;
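For context on the Grok loading case that closes above: when a model file still ships one 2D tensor per expert, the loader allocates a single merged 3D tensor of shape {n_embd, n_ff, n_expert} and exposes each expert as a view at byte offset nb[2]*x, which is why mmap has to be disabled for that path. The following standalone C++ sketch illustrates only that stride arithmetic; the Tensor3D struct and helper names are invented for the example and are not part of llama.cpp.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Illustrative stand-in for a ggml-style tensor: ne = element counts per
    // dimension, nb = byte strides per dimension (nb[0] = element size).
    struct Tensor3D {
        std::vector<float> data;
        size_t ne[3];
        size_t nb[3];
    };

    Tensor3D make_merged(size_t n_embd, size_t n_ff, size_t n_expert) {
        Tensor3D t;
        t.ne[0] = n_embd; t.ne[1] = n_ff; t.ne[2] = n_expert;
        t.nb[0] = sizeof(float);
        t.nb[1] = t.nb[0] * t.ne[0];
        t.nb[2] = t.nb[1] * t.ne[1];          // bytes per expert slice
        t.data.resize(n_embd * n_ff * n_expert);
        return t;
    }

    // A per-expert "view": base pointer advanced by nb[2]*x, same 2D shape.
    float * expert_view(Tensor3D & t, size_t x) {
        char * base = reinterpret_cast<char *>(t.data.data());
        return reinterpret_cast<float *>(base + t.nb[2] * x);
    }

    int main() {
        Tensor3D gate = make_merged(8, 16, 4);
        for (size_t x = 0; x < gate.ne[2]; ++x) {
            expert_view(gate, x)[0] = float(x);   // write through each expert's view
        }
        printf("expert 3, first element: %.1f\n", gate.data[3 * 8 * 16]); // 3.0
    }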
|
4202
4638
|
case LLM_ARCH_BAICHUAN:
|
@@ -4235,9 +4671,9 @@ static bool llm_load_tensors(
|
|
4235
4671
|
{
|
4236
4672
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4237
4673
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
4238
|
-
|
4239
|
-
|
4240
|
-
|
4674
|
+
|
4675
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
4676
|
+
if (!model.output) {
|
4241
4677
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
4242
4678
|
ml.n_created--; // artificial tensor
|
4243
4679
|
ml.size_data += ggml_nbytes(model.output);
|
@@ -4253,10 +4689,8 @@ static bool llm_load_tensors(
|
|
4253
4689
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4254
4690
|
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
4255
4691
|
|
4256
|
-
|
4257
|
-
|
4258
|
-
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
|
4259
|
-
}
|
4692
|
+
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
|
4693
|
+
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
|
4260
4694
|
|
4261
4695
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
4262
4696
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
@@ -4436,16 +4870,19 @@ static bool llm_load_tensors(
|
|
4436
4870
|
case LLM_ARCH_MPT:
|
4437
4871
|
{
|
4438
4872
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4873
|
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
|
4439
4874
|
|
4440
4875
|
// output
|
4441
4876
|
{
|
4442
4877
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4443
4878
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
|
4444
4879
|
|
4445
|
-
|
4446
|
-
model.output
|
4447
|
-
|
4448
|
-
|
4880
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
4881
|
+
if (!model.output) {
|
4882
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
4883
|
+
ml.n_created--; // artificial tensor
|
4884
|
+
ml.size_data += ggml_nbytes(model.output);
|
4885
|
+
}
|
4449
4886
|
}
|
4450
4887
|
|
4451
4888
|
for (int i = 0; i < n_layer; ++i) {
|
@@ -4472,6 +4909,12 @@ static bool llm_load_tensors(
|
|
4472
4909
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4473
4910
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
|
4474
4911
|
|
4912
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
|
4913
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
|
4914
|
+
|
4915
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
|
4916
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
|
4917
|
+
|
4475
4918
|
// AWQ ScaleActivation layer
|
4476
4919
|
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
|
4477
4920
|
}
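The MPT and Falcon hunks above lean on the loader's optional-tensor convention: create_tensor called with required=false returns NULL when the tensor is absent, and the caller either skips the feature (pos_embd, Q/K norms) or falls back, e.g. reusing the token embedding as the output head. Below is a rough standalone sketch of that fallback pattern; the g_file map is a hypothetical stand-in for the GGUF metadata, not a real llama.cpp structure.

    #include <cstdio>
    #include <map>
    #include <stdexcept>
    #include <string>
    #include <vector>

    using Tensor = std::vector<float>;

    // Hypothetical stand-in for the model file: tensor name -> data.
    static std::map<std::string, Tensor> g_file = {
        { "token_embd.weight", Tensor(8, 1.0f) },
        // note: no "output.weight" entry -> the output head is tied to the embedding
    };

    // required=false mirrors the loader's optional-tensor convention: absent -> nullptr.
    Tensor * create_tensor(const std::string & name, bool required = true) {
        auto it = g_file.find(name);
        if (it == g_file.end()) {
            if (required) { throw std::runtime_error("missing tensor: " + name); }
            return nullptr;
        }
        return &it->second;
    }

    int main() {
        Tensor * output = create_tensor("output.weight", /*required=*/false);
        if (output == nullptr) {
            // fall back to the input token embedding (tied weights)
            output = create_tensor("token_embd.weight");
        }
        printf("output head has %zu elements\n", output->size());
    }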
|
@@ -4918,6 +5361,59 @@ static bool llm_load_tensors(
|
|
4918
5361
|
layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
|
4919
5362
|
}
|
4920
5363
|
} break;
|
5364
|
+
case LLM_ARCH_XVERSE:
|
5365
|
+
{
|
5366
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5367
|
+
{
|
5368
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5369
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
5370
|
+
}
|
5371
|
+
for (int i = 0; i < n_layer; ++i) {
|
5372
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
5373
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5374
|
+
auto & layer = model.layers[i];
|
5375
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5376
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5377
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5378
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5379
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5380
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
5381
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5382
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
5383
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5384
|
+
}
|
5385
|
+
} break;
|
5386
|
+
case LLM_ARCH_COMMAND_R:
|
5387
|
+
{
|
5388
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5389
|
+
|
5390
|
+
// output
|
5391
|
+
{
|
5392
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5393
|
+
// init output from the input tok embed
|
5394
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5395
|
+
ml.n_created--; // artificial tensor
|
5396
|
+
ml.size_data += ggml_nbytes(model.output);
|
5397
|
+
}
|
5398
|
+
|
5399
|
+
for (int i = 0; i < n_layer; ++i) {
|
5400
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
5401
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5402
|
+
|
5403
|
+
auto & layer = model.layers[i];
|
5404
|
+
|
5405
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5406
|
+
|
5407
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5408
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5409
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5410
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5411
|
+
|
5412
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5413
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
5414
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5415
|
+
}
|
5416
|
+
} break;
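In the XVERSE and Command-R cases above, wq is {n_embd, n_embd} while wk and wv are {n_embd, n_embd_gqa}: with grouped-query attention the K/V projections only cover n_head_kv heads. A small sketch of that dimension bookkeeping, with made-up example sizes:

    #include <cstdio>

    int main() {
        const int n_embd      = 4096;
        const int n_head      = 32;              // query heads
        const int n_head_kv   = 8;               // key/value heads (GQA)
        const int n_embd_head = n_embd / n_head; // 128

        // K and V only need n_head_kv heads worth of channels
        const int n_embd_gqa = n_embd_head * n_head_kv; // 1024

        printf("wq: %d x %d\n", n_embd, n_embd);     // 4096 x 4096
        printf("wk: %d x %d\n", n_embd, n_embd_gqa); // 4096 x 1024
        printf("wv: %d x %d\n", n_embd, n_embd_gqa); // 4096 x 1024
    }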
|
4921
5417
|
default:
|
4922
5418
|
throw std::runtime_error("unknown architecture");
|
4923
5419
|
}
|
@@ -4925,49 +5421,97 @@ static bool llm_load_tensors(
|
|
4925
5421
|
|
4926
5422
|
ml.done_getting_tensors();
|
4927
5423
|
|
4928
|
-
ml.
|
5424
|
+
ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
|
5425
|
+
model.mappings.reserve(ml.mappings.size());
|
4929
5426
|
|
4930
5427
|
// create the backend buffers
|
4931
|
-
std::vector<std::pair<ggml_context *,
|
5428
|
+
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
|
5429
|
+
ctx_bufs.reserve(ctx_map.size());
|
5430
|
+
|
5431
|
+
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
|
5432
|
+
size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
|
5433
|
+
model.bufs.reserve(n_max_backend_buffer);
|
4932
5434
|
|
4933
5435
|
for (auto & it : ctx_map) {
|
4934
5436
|
ggml_backend_buffer_type_t buft = it.first;
|
4935
|
-
ggml_context * ctx
|
4936
|
-
|
5437
|
+
ggml_context * ctx = it.second;
|
5438
|
+
|
5439
|
+
llama_buf_map bufs;
|
5440
|
+
bufs.reserve(n_max_backend_buffer);
|
4937
5441
|
|
4938
5442
|
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
4939
5443
|
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
4940
5444
|
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
4941
|
-
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
|
4942
|
-
|
4943
|
-
|
4944
|
-
|
5445
|
+
if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
|
5446
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5447
|
+
void * addr = nullptr;
|
5448
|
+
size_t first, last;
|
5449
|
+
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
5450
|
+
if (first >= last) {
|
5451
|
+
continue;
|
5452
|
+
}
|
5453
|
+
ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
|
5454
|
+
if (buf == nullptr) {
|
5455
|
+
throw std::runtime_error("unable to allocate backend CPU buffer");
|
5456
|
+
}
|
5457
|
+
model.bufs.push_back(buf);
|
5458
|
+
bufs.emplace(idx, buf);
|
5459
|
+
#ifdef GGML_USE_CUDA
|
5460
|
+
if (n_layer >= n_gpu_layers) {
|
5461
|
+
ggml_backend_cuda_register_host_buffer(
|
5462
|
+
ggml_backend_buffer_get_base(buf),
|
5463
|
+
ggml_backend_buffer_get_size(buf));
|
5464
|
+
}
|
5465
|
+
#endif
|
5466
|
+
}
|
4945
5467
|
}
|
4946
5468
|
#ifdef GGML_USE_METAL
|
4947
|
-
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
|
4948
|
-
|
4949
|
-
|
4950
|
-
|
4951
|
-
|
5469
|
+
else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
|
5470
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5471
|
+
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
5472
|
+
void * addr = nullptr;
|
5473
|
+
size_t first, last;
|
5474
|
+
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
5475
|
+
if (first >= last) {
|
5476
|
+
continue;
|
5477
|
+
}
|
5478
|
+
ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
|
5479
|
+
if (buf == nullptr) {
|
5480
|
+
throw std::runtime_error("unable to allocate backend metal buffer");
|
5481
|
+
}
|
5482
|
+
model.bufs.push_back(buf);
|
5483
|
+
bufs.emplace(idx, buf);
|
5484
|
+
}
|
4952
5485
|
}
|
4953
5486
|
#endif
|
4954
5487
|
else {
|
4955
|
-
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
4956
|
-
if (buf
|
5488
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
5489
|
+
if (buf == nullptr) {
|
5490
|
+
throw std::runtime_error("unable to allocate backend buffer");
|
5491
|
+
}
|
5492
|
+
model.bufs.push_back(buf);
|
5493
|
+
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
|
4957
5494
|
model.mlock_bufs.emplace_back(new llama_mlock);
|
4958
5495
|
auto & mlock_buf = model.mlock_bufs.back();
|
4959
5496
|
mlock_buf->init (ggml_backend_buffer_get_base(buf));
|
4960
5497
|
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
|
4961
5498
|
}
|
5499
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5500
|
+
bufs.emplace(idx, buf);
|
5501
|
+
}
|
4962
5502
|
}
|
4963
|
-
|
5503
|
+
|
5504
|
+
if (bufs.empty()) {
|
4964
5505
|
throw std::runtime_error("failed to allocate buffer");
|
4965
5506
|
}
|
4966
|
-
|
4967
|
-
|
4968
|
-
|
4969
|
-
|
4970
|
-
|
5507
|
+
|
5508
|
+
for (auto & buf : bufs) {
|
5509
|
+
// indicate that this buffer contains weights
|
5510
|
+
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
|
5511
|
+
ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
5512
|
+
}
|
5513
|
+
|
5514
|
+
ctx_bufs.emplace_back(ctx, bufs);
|
4971
5515
|
}
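The buffer-creation loop that ends above maps only the byte range of each file that actually holds tensors used by the context (get_mapping_range) and wraps that range in a CPU or Metal buffer, which is what makes partial offloading with mmap possible. Below is a simplified, dependency-free sketch of that range computation; the TensorSpan type is invented for the example.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Hypothetical description of where a tensor's data lives inside one mmap'ed file.
    struct TensorSpan {
        size_t offset; // byte offset within the file mapping
        size_t size;   // byte size of the tensor data
    };

    // Compute the [first, last) byte range that covers all tensors of one context.
    // If the context has no tensors in this file, first == last and the caller skips it.
    void get_mapping_range(const std::vector<TensorSpan> & spans, size_t & first, size_t & last) {
        first = SIZE_MAX;
        last  = 0;
        for (const auto & s : spans) {
            first = std::min(first, s.offset);
            last  = std::max(last,  s.offset + s.size);
        }
        if (spans.empty()) { first = last = 0; }
    }

    int main() {
        std::vector<TensorSpan> spans = { {4096, 1024}, {16384, 2048}, {8192, 512} };
        size_t first, last;
        get_mapping_range(spans, first, last);
        // only [first, last) would be exposed to the backend as a buffer
        printf("map bytes [%zu, %zu), %zu bytes total\n", first, last, last - first);
    }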
|
4972
5516
|
|
4973
5517
|
if (llama_supports_gpu_offload()) {
|
@@ -4999,13 +5543,17 @@ static bool llm_load_tensors(
|
|
4999
5543
|
// load tensor data
|
5000
5544
|
for (auto & it : ctx_bufs) {
|
5001
5545
|
ggml_context * ctx = it.first;
|
5002
|
-
|
5003
|
-
if (!ml.load_all_data(ctx,
|
5546
|
+
auto & bufs = it.second;
|
5547
|
+
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
5004
5548
|
return false;
|
5005
5549
|
}
|
5006
5550
|
}
|
5007
5551
|
|
5008
|
-
|
5552
|
+
if (use_mmap_buffer) {
|
5553
|
+
for (auto & mapping : ml.mappings) {
|
5554
|
+
model.mappings.emplace_back(std::move(mapping));
|
5555
|
+
}
|
5556
|
+
}
|
5009
5557
|
|
5010
5558
|
// loading time will be recalculated after the first eval, so
|
5011
5559
|
// we take page faults deferred by mmap() into consideration
|
@@ -5064,6 +5612,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
|
5064
5612
|
}
|
5065
5613
|
#endif
|
5066
5614
|
|
5615
|
+
#ifdef GGML_USE_SYCL
|
5616
|
+
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
|
5617
|
+
ggml_backend_sycl_set_single_device_mode(params.main_gpu);
|
5618
|
+
// SYCL uses device indices (0, 1, 2) directly; the user-supplied device id is converted to a device index.
|
5619
|
+
params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
|
5620
|
+
} else {
|
5621
|
+
ggml_backend_sycl_set_mul_device_mode();
|
5622
|
+
}
|
5623
|
+
#endif
|
5624
|
+
|
5067
5625
|
if (!llm_load_tensors(
|
5068
5626
|
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
|
5069
5627
|
params.progress_callback, params.progress_callback_user_data
|
@@ -5150,8 +5708,8 @@ static void llm_build_kv_store(
|
|
5150
5708
|
GGML_ASSERT(kv.size == n_ctx);
|
5151
5709
|
|
5152
5710
|
// compute the transposed [n_tokens, n_embd] V matrix
|
5153
|
-
|
5154
|
-
|
5711
|
+
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
|
5712
|
+
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
|
5155
5713
|
cb(v_cur_t, "v_cur_t", il);
|
5156
5714
|
|
5157
5715
|
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
|
@@ -5335,6 +5893,20 @@ static struct ggml_tensor * llm_build_kqv(
|
|
5335
5893
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
5336
5894
|
}
|
5337
5895
|
|
5896
|
+
if (model.arch == LLM_ARCH_GROK) {
|
5897
|
+
// need to do the following:
|
5898
|
+
// multiply by attn_output_multiplier of 0.08838834764831845
|
5899
|
+
// and then:
|
5900
|
+
// kq = 30 * tanh(kq / 30)
|
5901
|
+
// before the softmax below
|
5902
|
+
|
5903
|
+
//try from phi2
|
5904
|
+
//ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
5905
|
+
|
5906
|
+
kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
|
5907
|
+
kq = ggml_scale(ctx, kq, 30);
|
5908
|
+
}
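The Grok branch that closes above first applies the attention-output multiplier and then soft-caps the attention logits with 30 * tanh(x / 30), keeping them in (-30, 30) before the softmax. A tiny numeric sketch of that transform; the constants are copied from the hunk, everything else is illustrative.

    #include <cmath>
    #include <cstdio>
    #include <initializer_list>

    // kq is first scaled by the attention-output multiplier, then soft-capped:
    //   kq = 30 * tanh((kq * 0.08838834764831845) / 30)
    float grok_softcap(float kq) {
        const float mult = 0.08838834764831845f;
        const float cap  = 30.0f;
        return cap * std::tanh(kq * mult / cap);
    }

    int main() {
        for (float kq : { -10000.0f, -100.0f, 0.0f, 100.0f, 10000.0f }) {
            printf("kq = %9.1f  ->  %8.4f\n", kq, grok_softcap(kq)); // always in (-30, 30)
        }
    }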
|
5909
|
+
|
5338
5910
|
#if defined(GGML_USE_KOMPUTE)
|
5339
5911
|
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
|
5340
5912
|
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
@@ -5461,7 +6033,8 @@ struct llm_build_context {
|
|
5461
6033
|
const float norm_rms_eps;
|
5462
6034
|
|
5463
6035
|
const int32_t n_tokens;
|
5464
|
-
const int32_t n_kv; // size of KV cache to consider (n_kv <=
|
6036
|
+
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
|
6037
|
+
const int32_t n_outputs;
|
5465
6038
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
5466
6039
|
const int32_t n_orig_ctx;
|
5467
6040
|
|
@@ -5508,6 +6081,7 @@ struct llm_build_context {
|
|
5508
6081
|
norm_rms_eps (hparams.f_norm_rms_eps),
|
5509
6082
|
n_tokens (batch.n_tokens),
|
5510
6083
|
n_kv (worst_case ? kv_self.size : kv_self.n),
|
6084
|
+
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
|
5511
6085
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
5512
6086
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
5513
6087
|
pooling_type (cparams.pooling_type),
|
@@ -5529,6 +6103,7 @@ struct llm_build_context {
|
|
5529
6103
|
lctx.inp_tokens = nullptr;
|
5530
6104
|
lctx.inp_embd = nullptr;
|
5531
6105
|
lctx.inp_pos = nullptr;
|
6106
|
+
lctx.inp_out_ids = nullptr;
|
5532
6107
|
lctx.inp_KQ_mask = nullptr;
|
5533
6108
|
lctx.inp_KQ_pos = nullptr;
|
5534
6109
|
lctx.inp_K_shift = nullptr;
|
@@ -5652,6 +6227,13 @@ struct llm_build_context {
|
|
5652
6227
|
return lctx.inp_pos;
|
5653
6228
|
}
|
5654
6229
|
|
6230
|
+
struct ggml_tensor * build_inp_out_ids() {
|
6231
|
+
lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
|
6232
|
+
cb(lctx.inp_out_ids, "inp_out_ids", -1);
|
6233
|
+
ggml_set_input(lctx.inp_out_ids);
|
6234
|
+
return lctx.inp_out_ids;
|
6235
|
+
}
|
6236
|
+
|
5655
6237
|
struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
|
5656
6238
|
if (causal) {
|
5657
6239
|
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
|
@@ -5708,6 +6290,9 @@ struct llm_build_context {
|
|
5708
6290
|
struct ggml_cgraph * build_llama() {
|
5709
6291
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5710
6292
|
|
6293
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
6294
|
+
int32_t n_tokens = this->n_tokens;
|
6295
|
+
|
5711
6296
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5712
6297
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
5713
6298
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
@@ -5775,6 +6360,14 @@ struct llm_build_context {
|
|
5775
6360
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5776
6361
|
}
|
5777
6362
|
|
6363
|
+
if (il == n_layer - 1) {
|
6364
|
+
// skip computing output for unused tokens
|
6365
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6366
|
+
n_tokens = n_outputs;
|
6367
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6368
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6369
|
+
}
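The block above is the new "skip unused outputs" optimization: on the last layer, ggml_get_rows(cur, inp_out_ids) keeps only the rows whose logits are actually requested, so the final matmul against the vocabulary runs over n_outputs rows instead of n_tokens. A plain-C++ sketch of that row gather; the Matrix alias exists only for the example.

    #include <cstdio>
    #include <vector>

    using Matrix = std::vector<std::vector<float>>; // [n_rows][n_cols]

    // Equivalent in spirit to ggml_get_rows: pick a subset of rows by index.
    Matrix get_rows(const Matrix & src, const std::vector<int> & ids) {
        Matrix out;
        out.reserve(ids.size());
        for (int id : ids) {
            out.push_back(src[id]);
        }
        return out;
    }

    int main() {
        // 5 tokens, hidden size 3; only the logits of tokens 2 and 4 are needed
        Matrix hidden = { {0,0,0}, {1,1,1}, {2,2,2}, {3,3,3}, {4,4,4} };
        std::vector<int> out_ids = { 2, 4 };

        Matrix kept = get_rows(hidden, out_ids);
        printf("rows kept for the lm_head: %zu of %zu\n", kept.size(), hidden.size());
    }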
|
6370
|
+
|
5778
6371
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
5779
6372
|
cb(ffn_inp, "ffn_inp", il);
|
5780
6373
|
|
@@ -5827,19 +6420,19 @@ struct llm_build_context {
|
|
5827
6420
|
for (int i = 0; i < n_expert_used; ++i) {
|
5828
6421
|
ggml_tensor * cur_expert;
|
5829
6422
|
|
5830
|
-
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].
|
6423
|
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
5831
6424
|
cb(cur_up, "ffn_moe_up", il);
|
5832
6425
|
|
5833
|
-
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].
|
6426
|
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
5834
6427
|
cb(cur_gate, "ffn_moe_gate", il);
|
5835
6428
|
|
5836
6429
|
cur_gate = ggml_silu(ctx0, cur_gate);
|
5837
6430
|
cb(cur_gate, "ffn_moe_silu", il);
|
5838
6431
|
|
5839
|
-
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6432
|
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
5840
6433
|
cb(cur_expert, "ffn_moe_gate_par", il);
|
5841
6434
|
|
5842
|
-
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].
|
6435
|
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
5843
6436
|
cb(cur_expert, "ffn_moe_down", il);
|
5844
6437
|
|
5845
6438
|
cur_expert = ggml_mul(ctx0, cur_expert,
|
@@ -5858,6 +6451,12 @@ struct llm_build_context {
|
|
5858
6451
|
}
|
5859
6452
|
|
5860
6453
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
6454
|
+
cb(cur, "ffn_out", il);
|
6455
|
+
|
6456
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
6457
|
+
if (layer_dir != nullptr) {
|
6458
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
6459
|
+
}
|
5861
6460
|
cb(cur, "l_out", il);
|
5862
6461
|
|
5863
6462
|
// input for next layer
|
@@ -5893,7 +6492,7 @@ struct llm_build_context {
|
|
5893
6492
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
5894
6493
|
|
5895
6494
|
// inp_pos - contains the positions
|
5896
|
-
struct ggml_tensor * inp_pos = build_inp_pos();
|
6495
|
+
struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
|
5897
6496
|
|
5898
6497
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5899
6498
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
@@ -5943,12 +6542,18 @@ struct llm_build_context {
|
|
5943
6542
|
cb(Qcur, "Qcur", il);
|
5944
6543
|
cb(Kcur, "Kcur", il);
|
5945
6544
|
|
5946
|
-
|
5947
6545
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5948
6546
|
model.layers[il].wo, NULL,
|
5949
6547
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5950
6548
|
}
|
5951
6549
|
|
6550
|
+
if (il == n_layer - 1) {
|
6551
|
+
// skip computing output for unused tokens
|
6552
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6553
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6554
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6555
|
+
}
|
6556
|
+
|
5952
6557
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
5953
6558
|
cb(ffn_inp, "ffn_inp", il);
|
5954
6559
|
|
@@ -5991,6 +6596,111 @@ struct llm_build_context {
|
|
5991
6596
|
return gf;
|
5992
6597
|
}
|
5993
6598
|
|
6599
|
+
struct ggml_cgraph * build_xverse() {
|
6600
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6601
|
+
|
6602
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6603
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6604
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6605
|
+
|
6606
|
+
struct ggml_tensor * cur;
|
6607
|
+
struct ggml_tensor * inpL;
|
6608
|
+
|
6609
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6610
|
+
|
6611
|
+
// inp_pos - contains the positions
|
6612
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6613
|
+
|
6614
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6615
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6616
|
+
|
6617
|
+
// positions of the tokens in the KV cache
|
6618
|
+
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
6619
|
+
|
6620
|
+
for (int il = 0; il < n_layer; ++il) {
|
6621
|
+
struct ggml_tensor * inpSA = inpL;
|
6622
|
+
|
6623
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6624
|
+
model.layers[il].attn_norm, NULL,
|
6625
|
+
LLM_NORM_RMS, cb, il);
|
6626
|
+
cb(cur, "attn_norm", il);
|
6627
|
+
|
6628
|
+
// self-attention
|
6629
|
+
{
|
6630
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6631
|
+
cb(Qcur, "Qcur", il);
|
6632
|
+
|
6633
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6634
|
+
cb(Kcur, "Kcur", il);
|
6635
|
+
|
6636
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6637
|
+
cb(Vcur, "Vcur", il);
|
6638
|
+
|
6639
|
+
Qcur = ggml_rope_custom(
|
6640
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6641
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6642
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6643
|
+
);
|
6644
|
+
cb(Qcur, "Qcur", il);
|
6645
|
+
|
6646
|
+
Kcur = ggml_rope_custom(
|
6647
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6648
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6649
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6650
|
+
);
|
6651
|
+
cb(Kcur, "Kcur", il);
|
6652
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6653
|
+
model.layers[il].wo, NULL,
|
6654
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6655
|
+
}
|
6656
|
+
|
6657
|
+
if (il == n_layer - 1) {
|
6658
|
+
// skip computing output for unused tokens
|
6659
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6660
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6661
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6662
|
+
}
|
6663
|
+
|
6664
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6665
|
+
cb(ffn_inp, "ffn_inp", il);
|
6666
|
+
|
6667
|
+
// feed-forward network
|
6668
|
+
{
|
6669
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6670
|
+
model.layers[il].ffn_norm, NULL,
|
6671
|
+
LLM_NORM_RMS, cb, il);
|
6672
|
+
cb(cur, "ffn_norm", il);
|
6673
|
+
|
6674
|
+
cur = llm_build_ffn(ctx0, cur,
|
6675
|
+
model.layers[il].ffn_up, NULL,
|
6676
|
+
model.layers[il].ffn_gate, NULL,
|
6677
|
+
model.layers[il].ffn_down, NULL,
|
6678
|
+
NULL,
|
6679
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
6680
|
+
cb(cur, "ffn_out", il);
|
6681
|
+
}
|
6682
|
+
|
6683
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6684
|
+
cb(cur, "l_out", il);
|
6685
|
+
|
6686
|
+
// input for next layer
|
6687
|
+
inpL = cur;
|
6688
|
+
}
|
6689
|
+
|
6690
|
+
cur = inpL;
|
6691
|
+
|
6692
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
|
6693
|
+
cb(cur, "result_norm", -1);
|
6694
|
+
|
6695
|
+
// lm_head
|
6696
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
6697
|
+
cb(cur, "result_output", -1);
|
6698
|
+
|
6699
|
+
ggml_build_forward_expand(gf, cur);
|
6700
|
+
|
6701
|
+
return gf;
|
6702
|
+
}
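build_xverse, which ends above, follows the usual pre-norm transformer layout and rotates Q and K with ggml_rope_custom before attention. The sketch below shows the core of a rotary embedding on one head vector, assuming the plain "rotate adjacent pairs" formulation with base frequency 10000; the frequency-scaling and YaRN parameters passed in the real call are omitted.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Rotate adjacent pairs (x[2i], x[2i+1]) of one head vector by a position-dependent
    // angle: theta_i = pos * base^(-i / n_dims). Extension tricks are left out of the sketch.
    void rope_inplace(std::vector<float> & x, int pos, float base = 10000.0f) {
        const int n_dims = (int) x.size();
        for (int i = 0; i < n_dims; i += 2) {
            const float theta = pos * std::pow(base, -float(i) / n_dims);
            const float c = std::cos(theta), s = std::sin(theta);
            const float x0 = x[i], x1 = x[i + 1];
            x[i]     = x0 * c - x1 * s;
            x[i + 1] = x0 * s + x1 * c;
        }
    }

    int main() {
        std::vector<float> q = { 1.0f, 0.0f, 1.0f, 0.0f };
        rope_inplace(q, /*pos=*/3);
        printf("q after RoPE at pos 3: %.3f %.3f %.3f %.3f\n", q[0], q[1], q[2], q[3]);
    }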
|
6703
|
+
|
5994
6704
|
struct ggml_cgraph * build_falcon() {
|
5995
6705
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5996
6706
|
|
@@ -6064,6 +6774,14 @@ struct llm_build_context {
|
|
6064
6774
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6065
6775
|
}
|
6066
6776
|
|
6777
|
+
if (il == n_layer - 1) {
|
6778
|
+
// skip computing output for unused tokens
|
6779
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6780
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6781
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
6782
|
+
attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
|
6783
|
+
}
|
6784
|
+
|
6067
6785
|
struct ggml_tensor * ffn_inp = cur;
|
6068
6786
|
|
6069
6787
|
// feed forward
|
@@ -6104,6 +6822,214 @@ struct llm_build_context {
|
|
6104
6822
|
return gf;
|
6105
6823
|
}
|
6106
6824
|
|
6825
|
+
struct ggml_cgraph * build_grok() {
|
6826
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6827
|
+
|
6828
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
6829
|
+
int32_t n_tokens = this->n_tokens;
|
6830
|
+
|
6831
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6832
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6833
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6834
|
+
|
6835
|
+
struct ggml_tensor * cur;
|
6836
|
+
struct ggml_tensor * inpL;
|
6837
|
+
|
6838
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6839
|
+
|
6840
|
+
// multiply by embedding_multiplier_scale of 78.38367176906169
|
6841
|
+
inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
|
6842
|
+
|
6843
|
+
// inp_pos - contains the positions
|
6844
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6845
|
+
|
6846
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6847
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6848
|
+
|
6849
|
+
for (int il = 0; il < n_layer; ++il) {
|
6850
|
+
struct ggml_tensor * inpSA = inpL;
|
6851
|
+
|
6852
|
+
// norm
|
6853
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6854
|
+
model.layers[il].attn_norm, NULL,
|
6855
|
+
LLM_NORM_RMS, cb, il);
|
6856
|
+
cb(cur, "attn_norm", il);
|
6857
|
+
|
6858
|
+
|
6859
|
+
// self-attention
|
6860
|
+
{
|
6861
|
+
// compute Q and K and RoPE them
|
6862
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6863
|
+
cb(Qcur, "Qcur", il);
|
6864
|
+
if (model.layers[il].bq) {
|
6865
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
6866
|
+
cb(Qcur, "Qcur", il);
|
6867
|
+
}
|
6868
|
+
|
6869
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6870
|
+
cb(Kcur, "Kcur", il);
|
6871
|
+
if (model.layers[il].bk) {
|
6872
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
6873
|
+
cb(Kcur, "Kcur", il);
|
6874
|
+
}
|
6875
|
+
|
6876
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6877
|
+
cb(Vcur, "Vcur", il);
|
6878
|
+
if (model.layers[il].bv) {
|
6879
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
6880
|
+
cb(Vcur, "Vcur", il);
|
6881
|
+
}
|
6882
|
+
|
6883
|
+
Qcur = ggml_rope_custom(
|
6884
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6885
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6886
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6887
|
+
);
|
6888
|
+
cb(Qcur, "Qcur", il);
|
6889
|
+
|
6890
|
+
Kcur = ggml_rope_custom(
|
6891
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6892
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6893
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6894
|
+
);
|
6895
|
+
cb(Kcur, "Kcur", il);
|
6896
|
+
|
6897
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6898
|
+
model.layers[il].wo, model.layers[il].bo,
|
6899
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
6900
|
+
}
|
6901
|
+
|
6902
|
+
if (il == n_layer - 1) {
|
6903
|
+
// skip computing output for unused tokens
|
6904
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6905
|
+
n_tokens = n_outputs;
|
6906
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6907
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6908
|
+
}
|
6909
|
+
|
6910
|
+
// Grok
|
6911
|
+
// if attn_out_norm is present then apply it before adding the input
|
6912
|
+
if (model.layers[il].attn_out_norm) {
|
6913
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6914
|
+
model.layers[il].attn_out_norm, NULL,
|
6915
|
+
LLM_NORM_RMS, cb, il);
|
6916
|
+
cb(cur, "attn_out_norm", il);
|
6917
|
+
}
|
6918
|
+
|
6919
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6920
|
+
cb(ffn_inp, "ffn_inp", il);
|
6921
|
+
|
6922
|
+
// feed-forward network
|
6923
|
+
// MoE branch
|
6924
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6925
|
+
model.layers[il].ffn_norm, NULL,
|
6926
|
+
LLM_NORM_RMS, cb, il);
|
6927
|
+
cb(cur, "ffn_norm", il);
|
6928
|
+
|
6929
|
+
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
6930
|
+
cb(logits, "ffn_moe_logits", il);
|
6931
|
+
|
6932
|
+
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
6933
|
+
cb(probs, "ffn_moe_probs", il);
|
6934
|
+
|
6935
|
+
// select experts
|
6936
|
+
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
6937
|
+
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
6938
|
+
|
6939
|
+
ggml_tensor * weights = ggml_get_rows(ctx0,
|
6940
|
+
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
6941
|
+
cb(weights, "ffn_moe_weights", il);
|
6942
|
+
|
6943
|
+
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
6944
|
+
|
6945
|
+
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
6946
|
+
cb(weights_sum, "ffn_moe_weights_sum", il);
|
6947
|
+
|
6948
|
+
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
6949
|
+
cb(weights, "ffn_moe_weights_norm", il);
|
6950
|
+
|
6951
|
+
// compute expert outputs
|
6952
|
+
ggml_tensor * moe_out = nullptr;
|
6953
|
+
|
6954
|
+
for (int i = 0; i < n_expert_used; ++i) {
|
6955
|
+
ggml_tensor * cur_expert;
|
6956
|
+
|
6957
|
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
6958
|
+
cb(cur_up, "ffn_moe_up", il);
|
6959
|
+
|
6960
|
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
6961
|
+
cb(cur_gate, "ffn_moe_gate", il);
|
6962
|
+
|
6963
|
+
//GeLU
|
6964
|
+
cur_gate = ggml_gelu(ctx0, cur_gate);
|
6965
|
+
cb(cur_gate, "ffn_moe_gelu", il);
|
6966
|
+
|
6967
|
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6968
|
+
cb(cur_expert, "ffn_moe_gate_par", il);
|
6969
|
+
|
6970
|
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
6971
|
+
cb(cur_expert, "ffn_moe_down", il);
|
6972
|
+
|
6973
|
+
cur_expert = ggml_mul(ctx0, cur_expert,
|
6974
|
+
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
6975
|
+
cb(cur_expert, "ffn_moe_weighted", il);
|
6976
|
+
|
6977
|
+
if (i == 0) {
|
6978
|
+
moe_out = cur_expert;
|
6979
|
+
} else {
|
6980
|
+
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
6981
|
+
cb(moe_out, "ffn_moe_out", il);
|
6982
|
+
}
|
6983
|
+
}
|
6984
|
+
|
6985
|
+
cur = moe_out;
|
6986
|
+
|
6987
|
+
// Grok
|
6988
|
+
// if layer_out_norm is present then apply it before adding the input
|
6989
|
+
// Idea: maybe ffn_out_norm is a better name
|
6990
|
+
if (model.layers[il].layer_out_norm) {
|
6991
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6992
|
+
model.layers[il].layer_out_norm, NULL,
|
6993
|
+
LLM_NORM_RMS, cb, il);
|
6994
|
+
cb(cur, "layer_out_norm", il);
|
6995
|
+
}
|
6996
|
+
|
6997
|
+
|
6998
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6999
|
+
cb(cur, "ffn_out", il);
|
7000
|
+
|
7001
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
7002
|
+
if (layer_dir != nullptr) {
|
7003
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
7004
|
+
}
|
7005
|
+
cb(cur, "l_out", il);
|
7006
|
+
|
7007
|
+
// input for next layer
|
7008
|
+
inpL = cur;
|
7009
|
+
}
|
7010
|
+
|
7011
|
+
cur = inpL;
|
7012
|
+
|
7013
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7014
|
+
model.output_norm, NULL,
|
7015
|
+
LLM_NORM_RMS, cb, -1);
|
7016
|
+
cb(cur, "result_norm", -1);
|
7017
|
+
|
7018
|
+
// lm_head
|
7019
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7020
|
+
|
7021
|
+
// Grok
|
7022
|
+
// multiply logits by output_multiplier_scale of 0.5773502691896257
|
7023
|
+
|
7024
|
+
cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
|
7025
|
+
|
7026
|
+
cb(cur, "result_output", -1);
|
7027
|
+
|
7028
|
+
ggml_build_forward_expand(gf, cur);
|
7029
|
+
|
7030
|
+
return gf;
|
7031
|
+
}
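build_grok, which ends above, implements the same MoE routing pattern used by the Mixtral-style llama path: gate logits, softmax over all experts, top-k selection, renormalization of the selected weights, and a weighted mix of GELU-gated expert FFNs, plus the Grok-specific embedding scale (78.38367176906169) and output scale (0.5773502691896257). The compact standalone sketch below covers only the routing and mixing arithmetic; the per-expert "FFN" is a dummy function used for illustration.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    // Dummy stand-in for one expert's FFN applied to a token's hidden state.
    float expert_ffn(int expert, float h) { return h * (expert + 1); }

    int main() {
        const int n_expert      = 8;
        const int n_expert_used = 2;   // top-k
        std::vector<float> logits = { 0.1f, 2.0f, -1.0f, 0.5f, 1.5f, 0.0f, -0.5f, 0.3f };

        // softmax over the gate logits
        std::vector<float> probs(n_expert);
        float mx = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (int e = 0; e < n_expert; ++e) { probs[e] = std::exp(logits[e] - mx); sum += probs[e]; }
        for (float & p : probs) { p /= sum; }

        // top-k experts by probability
        std::vector<int> idx(n_expert);
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });

        // renormalize the selected weights so they sum to 1
        float wsum = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) { wsum += probs[idx[i]]; }

        // weighted mix of the selected experts' outputs
        const float h = 0.25f;
        float out = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) {
            const int e = idx[i];
            out += (probs[e] / wsum) * expert_ffn(e, h);
        }
        printf("selected experts: %d, %d  ->  moe_out = %.4f\n", idx[0], idx[1], out);
    }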
|
7032
|
+
|
6107
7033
|
struct ggml_cgraph * build_starcoder() {
|
6108
7034
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6109
7035
|
|
@@ -6158,6 +7084,13 @@ struct llm_build_context {
|
|
6158
7084
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6159
7085
|
}
|
6160
7086
|
|
7087
|
+
if (il == n_layer - 1) {
|
7088
|
+
// skip computing output for unused tokens
|
7089
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7090
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7091
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7092
|
+
}
|
7093
|
+
|
6161
7094
|
// add the input
|
6162
7095
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
6163
7096
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -6355,6 +7288,13 @@ struct llm_build_context {
|
|
6355
7288
|
Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6356
7289
|
}
|
6357
7290
|
|
7291
|
+
if (il == n_layer - 1) {
|
7292
|
+
// skip computing output for unused tokens
|
7293
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7294
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7295
|
+
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
7296
|
+
}
|
7297
|
+
|
6358
7298
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
|
6359
7299
|
cb(ffn_inp, "ffn_inp", il);
|
6360
7300
|
|
@@ -6444,6 +7384,13 @@ struct llm_build_context {
|
|
6444
7384
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6445
7385
|
}
|
6446
7386
|
|
7387
|
+
if (il == n_layer - 1) {
|
7388
|
+
// skip computing output for unused tokens
|
7389
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7390
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7391
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7392
|
+
}
|
7393
|
+
|
6447
7394
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6448
7395
|
cb(ffn_inp, "ffn_inp", il);
|
6449
7396
|
|
@@ -6601,6 +7548,13 @@ struct llm_build_context {
|
|
6601
7548
|
}
|
6602
7549
|
cb(cur, "kqv_out", il);
|
6603
7550
|
|
7551
|
+
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
7552
|
+
// skip computing output for unused tokens
|
7553
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7554
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7555
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7556
|
+
}
|
7557
|
+
|
6604
7558
|
// re-add the layer input
|
6605
7559
|
cur = ggml_add(ctx0, cur, inpL);
|
6606
7560
|
|
@@ -6723,6 +7677,13 @@ struct llm_build_context {
|
|
6723
7677
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6724
7678
|
}
|
6725
7679
|
|
7680
|
+
if (il == n_layer - 1) {
|
7681
|
+
// skip computing output for unused tokens
|
7682
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7683
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7684
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7685
|
+
}
|
7686
|
+
|
6726
7687
|
// Add the input
|
6727
7688
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
6728
7689
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -6770,6 +7731,7 @@ struct llm_build_context {
|
|
6770
7731
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6771
7732
|
|
6772
7733
|
struct ggml_tensor * cur;
|
7734
|
+
struct ggml_tensor * pos;
|
6773
7735
|
struct ggml_tensor * inpL;
|
6774
7736
|
|
6775
7737
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
@@ -6780,6 +7742,16 @@ struct llm_build_context {
|
|
6780
7742
|
// positions of the tokens in the KV cache
|
6781
7743
|
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
6782
7744
|
|
7745
|
+
if (model.pos_embd) {
|
7746
|
+
// inp_pos - contains the positions
|
7747
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7748
|
+
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
7749
|
+
cb(pos, "pos_embd", -1);
|
7750
|
+
|
7751
|
+
inpL = ggml_add(ctx0, inpL, pos);
|
7752
|
+
cb(inpL, "inpL", -1);
|
7753
|
+
}
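The MPT change that closes above adds an optional learned position embedding: when pos_embd is present, the row for each token's position is looked up and added to the token embedding before the first layer. A minimal sketch of that lookup-and-add, with tiny made-up dimensions:

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_ctx_train = 4;  // rows of the learned position table
        const int n_embd      = 3;

        // pos_embd[pos][d]: one learned vector per position
        std::vector<std::vector<float>> pos_embd(n_ctx_train, std::vector<float>(n_embd));
        for (int p = 0; p < n_ctx_train; ++p)
            for (int d = 0; d < n_embd; ++d)
                pos_embd[p][d] = 0.01f * p;

        // token embeddings for a 2-token batch at positions {0, 2}
        std::vector<std::vector<float>> inpL = { {1, 1, 1}, {2, 2, 2} };
        std::vector<int> pos = { 0, 2 };

        // inpL = inpL + get_rows(pos_embd, pos)
        for (size_t t = 0; t < inpL.size(); ++t)
            for (int d = 0; d < n_embd; ++d)
                inpL[t][d] += pos_embd[pos[t]][d];

        printf("token 1, dim 0 after adding pos_embd: %.2f\n", inpL[1][0]); // 2.02
    }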
|
7754
|
+
|
6783
7755
|
for (int il = 0; il < n_layer; ++il) {
|
6784
7756
|
struct ggml_tensor * attn_norm;
|
6785
7757
|
|
@@ -6814,11 +7786,39 @@ struct llm_build_context {
|
|
6814
7786
|
cb(Kcur, "Kcur", il);
|
6815
7787
|
cb(Vcur, "Vcur", il);
|
6816
7788
|
|
6817
|
-
|
7789
|
+
// Q/K Layernorm
|
7790
|
+
if (model.layers[il].attn_q_norm) {
|
7791
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
7792
|
+
model.layers[il].attn_q_norm,
|
7793
|
+
model.layers[il].attn_q_norm_b,
|
7794
|
+
LLM_NORM, cb, il);
|
7795
|
+
cb(Qcur, "Qcur", il);
|
6818
7796
|
|
6819
|
-
|
7797
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
7798
|
+
model.layers[il].attn_k_norm,
|
7799
|
+
model.layers[il].attn_k_norm_b,
|
7800
|
+
LLM_NORM, cb, il);
|
7801
|
+
cb(Kcur, "Kcur", il);
|
7802
|
+
|
7803
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7804
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
7805
|
+
|
7806
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6820
7807
|
model.layers[il].wo, model.layers[il].bo,
|
6821
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
7808
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7809
|
+
} else {
|
7810
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7811
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7812
|
+
model.layers[il].wo, model.layers[il].bo,
|
7813
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7814
|
+
}
|
7815
|
+
}
|
7816
|
+
|
7817
|
+
if (il == n_layer - 1) {
|
7818
|
+
// skip computing output for unused tokens
|
7819
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7820
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7821
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
6822
7822
|
}
|
6823
7823
|
|
6824
7824
|
// Add the input
|
@@ -6934,6 +7934,13 @@ struct llm_build_context {
|
|
6934
7934
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6935
7935
|
}
|
6936
7936
|
|
7937
|
+
if (il == n_layer - 1) {
|
7938
|
+
// skip computing output for unused tokens
|
7939
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7940
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7941
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7942
|
+
}
|
7943
|
+
|
6937
7944
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6938
7945
|
cb(ffn_inp, "ffn_inp", il);
|
6939
7946
|
|
@@ -7040,6 +8047,13 @@ struct llm_build_context {
|
|
7040
8047
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7041
8048
|
}
|
7042
8049
|
|
8050
|
+
if (il == n_layer - 1) {
|
8051
|
+
// skip computing output for unused tokens
|
8052
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8053
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8054
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8055
|
+
}
|
8056
|
+
|
7043
8057
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7044
8058
|
cb(ffn_inp, "ffn_inp", il);
|
7045
8059
|
|
@@ -7152,6 +8166,13 @@ struct llm_build_context {
|
|
7152
8166
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7153
8167
|
}
|
7154
8168
|
|
8169
|
+
if (il == n_layer - 1) {
|
8170
|
+
// skip computing output for unused tokens
|
8171
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8172
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8173
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8174
|
+
}
|
8175
|
+
|
7155
8176
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7156
8177
|
cb(ffn_inp, "ffn_inp", il);
|
7157
8178
|
|
@@ -7270,6 +8291,14 @@ struct llm_build_context {
|
|
7270
8291
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
7271
8292
|
}
|
7272
8293
|
|
8294
|
+
if (il == n_layer - 1) {
|
8295
|
+
// skip computing output for unused tokens
|
8296
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8297
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8298
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8299
|
+
attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
|
8300
|
+
}
|
8301
|
+
|
7273
8302
|
// FF
|
7274
8303
|
{
|
7275
8304
|
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
|
@@ -7367,6 +8396,14 @@ struct llm_build_context {
|
|
7367
8396
|
|
7368
8397
|
cur = attention_norm;
|
7369
8398
|
|
8399
|
+
if (il == n_layer - 1) {
|
8400
|
+
// skip computing output for unused tokens
|
8401
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8402
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8403
|
+
sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
|
8404
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8405
|
+
}
|
8406
|
+
|
7370
8407
|
// feed-forward network
|
7371
8408
|
{
|
7372
8409
|
cur = llm_build_ffn(ctx0, cur,
|
@@ -7459,6 +8496,13 @@ struct llm_build_context {
|
|
7459
8496
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7460
8497
|
}
|
7461
8498
|
|
8499
|
+
if (il == n_layer - 1) {
|
8500
|
+
// skip computing output for unused tokens
|
8501
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8502
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8503
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8504
|
+
}
|
8505
|
+
|
7462
8506
|
// add the input
|
7463
8507
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
7464
8508
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -7559,6 +8603,13 @@ struct llm_build_context {
|
|
7559
8603
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7560
8604
|
}
|
7561
8605
|
|
8606
|
+
if (il == n_layer - 1) {
|
8607
|
+
// skip computing output for unused tokens
|
8608
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8609
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8610
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8611
|
+
}
|
8612
|
+
|
7562
8613
|
// add the input
|
7563
8614
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
7564
8615
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -7668,6 +8719,13 @@ struct llm_build_context {
|
|
7668
8719
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7669
8720
|
}
|
7670
8721
|
|
8722
|
+
if (il == n_layer - 1) {
|
8723
|
+
// skip computing output for unused tokens
|
8724
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8725
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8726
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8727
|
+
}
|
8728
|
+
|
7671
8729
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7672
8730
|
cb(ffn_inp, "ffn_inp", il);
|
7673
8731
|
|
@@ -7778,6 +8836,13 @@ struct llm_build_context {
|
|
7778
8836
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7779
8837
|
}
|
7780
8838
|
|
8839
|
+
if (il == n_layer - 1) {
|
8840
|
+
// skip computing output for unused tokens
|
8841
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8842
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8843
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8844
|
+
}
|
8845
|
+
|
7781
8846
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7782
8847
|
cb(ffn_inp, "ffn_inp", il);
|
7783
8848
|
|
@@ -7901,6 +8966,13 @@ struct llm_build_context {
|
|
7901
8966
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7902
8967
|
}
|
7903
8968
|
|
8969
|
+
if (il == n_layer - 1) {
|
8970
|
+
// skip computing output for unused tokens
|
8971
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8972
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8973
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8974
|
+
}
|
8975
|
+
|
7904
8976
|
// scale_res - scale the hidden states for residual connection
|
7905
8977
|
const float scale_res = scale_depth/sqrtf(float(n_layer));
|
7906
8978
|
cur = ggml_scale(ctx0, cur, scale_res);
|
@@ -8015,6 +9087,13 @@ struct llm_build_context {
|
|
8015
9087
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
8016
9088
|
}
|
8017
9089
|
|
9090
|
+
if (il == n_layer - 1) {
|
9091
|
+
// skip computing output for unused tokens
|
9092
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9093
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9094
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9095
|
+
}
|
9096
|
+
|
8018
9097
|
struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
8019
9098
|
cb(sa_out, "sa_out", il);
|
8020
9099
|
|
@@ -8125,7 +9204,13 @@ struct llm_build_context {
|
|
8125
9204
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8126
9205
|
model.layers[il].wo, model.layers[il].bo,
|
8127
9206
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8128
|
-
|
9207
|
+
}
|
9208
|
+
|
9209
|
+
if (il == n_layer - 1) {
|
9210
|
+
// skip computing output for unused tokens
|
9211
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9212
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9213
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8129
9214
|
}
|
8130
9215
|
|
8131
9216
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
@@ -8275,6 +9360,15 @@ struct llm_build_context {
|
|
8275
9360
|
|
8276
9361
|
struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
|
8277
9362
|
|
9363
|
+
if (il == n_layer - 1) {
|
9364
|
+
// skip computing output for unused tokens
|
9365
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9366
|
+
x = ggml_get_rows(ctx0, x, inp_out_ids);
|
9367
|
+
y = ggml_get_rows(ctx0, y, inp_out_ids);
|
9368
|
+
z = ggml_get_rows(ctx0, z, inp_out_ids);
|
9369
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9370
|
+
}
|
9371
|
+
|
8278
9372
|
// {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
|
8279
9373
|
y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
|
8280
9374
|
y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
|
@@ -8305,6 +9399,129 @@ struct llm_build_context {
|
|
8305
9399
|
|
8306
9400
|
return gf;
|
8307
9401
|
}
|
9402
|
+
|
9403
|
+
struct ggml_cgraph * build_command_r() {
|
9404
|
+
|
9405
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
9406
|
+
|
9407
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
9408
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
9409
|
+
const float f_logit_scale = hparams.f_logit_scale;
|
9410
|
+
|
9411
|
+
struct ggml_tensor * cur;
|
9412
|
+
struct ggml_tensor * inpL;
|
9413
|
+
|
9414
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
9415
|
+
|
9416
|
+
// inp_pos - contains the positions
|
9417
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
9418
|
+
|
9419
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
9420
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
9421
|
+
|
9422
|
+
for (int il = 0; il < n_layer; ++il) {
|
9423
|
+
|
9424
|
+
// norm
|
9425
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
9426
|
+
model.layers[il].attn_norm, NULL,
|
9427
|
+
LLM_NORM, cb, il);
|
9428
|
+
cb(cur, "attn_norm", il);
|
9429
|
+
struct ggml_tensor * ffn_inp = cur;
|
9430
|
+
|
9431
|
+
// self-attention
|
9432
|
+
{
|
9433
|
+
// compute Q and K and RoPE them
|
9434
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
9435
|
+
cb(Qcur, "Qcur", il);
|
9436
|
+
if (model.layers[il].bq) {
|
9437
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
9438
|
+
cb(Qcur, "Qcur", il);
|
9439
|
+
}
|
9440
|
+
|
9441
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
9442
|
+
cb(Kcur, "Kcur", il);
|
9443
|
+
if (model.layers[il].bk) {
|
9444
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
9445
|
+
cb(Kcur, "Kcur", il);
|
9446
|
+
}
|
9447
|
+
|
9448
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
9449
|
+
cb(Vcur, "Vcur", il);
|
9450
|
+
if (model.layers[il].bv) {
|
9451
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
9452
|
+
cb(Vcur, "Vcur", il);
|
9453
|
+
}
|
9454
|
+
|
9455
|
+
Qcur = ggml_rope_custom(
|
9456
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9457
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9458
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
9459
|
+
);
|
9460
|
+
cb(Qcur, "Qcur", il);
|
9461
|
+
|
9462
|
+
Kcur = ggml_rope_custom(
|
9463
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
9464
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
9465
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
9466
|
+
);
|
9467
|
+
cb(Kcur, "Kcur", il);
|
9468
|
+
|
9469
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9470
|
+
model.layers[il].wo, model.layers[il].bo,
|
9471
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9472
|
+
}
|
9473
|
+
|
9474
|
+
if (il == n_layer - 1) {
|
9475
|
+
// skip computing output for unused tokens
|
9476
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9477
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9478
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9479
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
9480
|
+
}
|
9481
|
+
|
9482
|
+
struct ggml_tensor * attn_out = cur;
|
9483
|
+
|
9484
|
+
// feed-forward network
|
9485
|
+
{
|
9486
|
+
cur = llm_build_ffn(ctx0, ffn_inp,
|
9487
|
+
model.layers[il].ffn_up, NULL,
|
9488
|
+
model.layers[il].ffn_gate, NULL,
|
9489
|
+
model.layers[il].ffn_down, NULL,
|
9490
|
+
NULL,
|
9491
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
9492
|
+
cb(cur, "ffn_out", il);
|
9493
|
+
}
|
9494
|
+
|
9495
|
+
// add together residual + FFN + self-attention
|
9496
|
+
cur = ggml_add(ctx0, cur, inpL);
|
9497
|
+
cur = ggml_add(ctx0, cur, attn_out);
|
9498
|
+
cb(cur, "l_out", il);
|
9499
|
+
|
9500
|
+
// input for next layer
|
9501
|
+
inpL = cur;
|
9502
|
+
}
|
9503
|
+
|
9504
|
+
cur = inpL;
|
9505
|
+
|
9506
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
9507
|
+
model.output_norm, NULL,
|
9508
|
+
LLM_NORM, cb, -1);
|
9509
|
+
cb(cur, "result_norm", -1);
|
9510
|
+
|
9511
|
+
// lm_head
|
9512
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
9513
|
+
|
9514
|
+
if (f_logit_scale) {
|
9515
|
+
cur = ggml_scale(ctx0, cur, f_logit_scale);
|
9516
|
+
}
|
9517
|
+
|
9518
|
+
cb(cur, "result_output", -1);
|
9519
|
+
|
9520
|
+
ggml_build_forward_expand(gf, cur);
|
9521
|
+
|
9522
|
+
return gf;
|
9523
|
+
|
9524
|
+
}
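build_command_r, which ends above, differs from the llama layer in two ways visible in the hunk: attention and the FFN both read the same normed input (ffn_inp) and their outputs are added to the residual together, and the final logits are multiplied by f_logit_scale when it is non-zero. A rough numeric sketch of that parallel-residual layer shape, with dummy per-token scalars standing in for tensors:

    #include <cstdio>

    // Dummy stand-ins for the per-layer blocks; the real code operates on tensors.
    float attn(float x) { return 0.5f * x; }
    float ffn (float x) { return 0.25f * x; }
    float norm(float x) { return x; } // identity for the sketch

    int main() {
        float inpL = 1.0f;

        // parallel residual: both branches consume the same normed input
        float ffn_inp  = norm(inpL);
        float attn_out = attn(ffn_inp);
        float ffn_out  = ffn(ffn_inp);

        // residual + FFN + self-attention added together
        float l_out = inpL + attn_out + ffn_out;

        // final logit scaling, applied only when f_logit_scale is set
        const float f_logit_scale = 0.0625f;
        float logits = l_out;
        if (f_logit_scale) { logits *= f_logit_scale; }

        printf("l_out = %.4f, scaled logits = %.4f\n", l_out, logits);
    }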
|
8308
9525
|
};
|
8309
9526
|
|
8310
9527
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
@@ -8380,12 +9597,15 @@ static struct ggml_cgraph * llama_build_graph(
 }
 
 // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
-//
-
-
-
-
-
+// FIXME: fix in ggml_backend_sched
+const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+if (batch.n_tokens < 32 || full_offload) {
+if (il != -1 && strcmp(name, "norm") == 0) {
+for (auto * backend : lctx.backends) {
+if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+break;
+}
 }
 }
 }
@@ -8410,6 +9630,10 @@ static struct ggml_cgraph * llama_build_graph(
 {
 result = llm.build_falcon();
 } break;
+case LLM_ARCH_GROK:
+{
+result = llm.build_grok();
+} break;
 case LLM_ARCH_STARCODER:
 {
 result = llm.build_starcoder();
@@ -8487,6 +9711,14 @@ static struct ggml_cgraph * llama_build_graph(
 {
 result = llm.build_mamba();
 } break;
+case LLM_ARCH_XVERSE:
+{
+result = llm.build_xverse();
+} break;
+case LLM_ARCH_COMMAND_R:
+{
+result = llm.build_command_r();
+} break;
 default:
 GGML_ASSERT(false);
 }
@@ -8548,9 +9780,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
 ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
 }
 
+if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
+GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
+const int64_t n_tokens = batch.n_tokens;
+
+GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+
+if (lctx.n_outputs == n_tokens) {
+for (int i = 0; i < n_tokens; ++i) {
+data[i] = i;
+}
+} else if (batch.logits) {
+int32_t n_outputs = 0;
+for (int i = 0; i < n_tokens; ++i) {
+if (batch.logits[i]) {
+data[n_outputs++] = i;
+}
+}
+// the graph needs to have been passed the correct number of outputs
+GGML_ASSERT(lctx.n_outputs == n_outputs);
+} else if (lctx.n_outputs == 1) {
+// only keep last output
+data[0] = n_tokens - 1;
+} else {
+GGML_ASSERT(lctx.n_outputs == 0);
+}
+}
+
 GGML_ASSERT(
+// (!a || b) is a logical implication (a -> b)
+// !hparams.causal_attn -> !cparams.causal_attn
 (hparams.causal_attn || !cparams.causal_attn) &&
-"
+"causal attention with embedding models is not supported"
 );
 
 if (lctx.inp_KQ_mask) {
@@ -8729,6 +9991,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
 }
 }
 
+// Make sure enough space is available for outputs.
+// Returns max number of outputs for which space was reserved.
+static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+const auto & cparams = lctx.cparams;
+const auto & hparams = lctx.model.hparams;
+
+const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
+
+const auto n_batch = cparams.n_batch;
+const auto n_vocab = hparams.n_vocab;
+const auto n_embd = hparams.n_embd;
+
+// TODO: use a per-batch flag for logits presence instead
+const bool has_logits = cparams.causal_attn;
+const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+
+const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
+const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
+
+if (lctx.output_ids.empty()) {
+// init, never resized afterwards
+lctx.output_ids.resize(n_batch);
+}
+
+const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
+const size_t new_size = (logits_size + embd_size) * sizeof(float);
+
+// alloc only when more than the current capacity is required
+// TODO: also consider shrinking the buffer
+if (!lctx.buf_output || prev_size < new_size) {
+if (lctx.buf_output) {
+#ifndef NDEBUG
+// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
+LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+#endif
+ggml_backend_buffer_free(lctx.buf_output);
+lctx.buf_output = nullptr;
+lctx.logits = nullptr;
+lctx.embd = nullptr;
+}
+
+lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
+if (lctx.buf_output == nullptr) {
+LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
+return 0;
+}
+}
+
+float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
+
+lctx.logits = has_logits ? output_base : nullptr;
+lctx.embd = has_embd ? output_base + logits_size : nullptr;
+
+lctx.output_size = n_outputs_max;
+lctx.logits_size = logits_size;
+lctx.embd_size = embd_size;
+
+// set all ids as invalid (negative)
+std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
+
+ggml_backend_buffer_clear(lctx.buf_output, 0);
+
+lctx.n_outputs = 0;
+
+return n_outputs_max;
+}
+
+
 static void llama_graph_compute(
 llama_context & lctx,
 ggml_cgraph * gf,
@@ -8804,16 +10134,8 @@ static int llama_decode_internal(
 const int64_t n_embd = hparams.n_embd;
 const int64_t n_vocab = hparams.n_vocab;
 
-
-
-
-#ifndef NDEBUG
-auto & logits_valid = lctx.logits_valid;
-logits_valid.clear();
-logits_valid.resize(n_tokens_all);
-
-memset(logits_out, 0, lctx.logits_size*sizeof(float));
-#endif
+uint32_t n_outputs = 0;
+uint32_t n_outputs_prev = 0;
 
 const auto n_ubatch = cparams.n_ubatch;
 
@@ -8822,6 +10144,38 @@ static int llama_decode_internal(
 std::vector<llama_seq_id *> seq_id_arr;
 std::vector<std::vector<llama_seq_id>> seq_id;
 
+// count outputs
+if (batch_all.logits) {
+for (uint32_t i = 0; i < n_tokens_all; ++i) {
+n_outputs += batch_all.logits[i] != 0;
+}
+} else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
+n_outputs = n_tokens_all;
+} else {
+// keep last output only
+n_outputs = 1;
+}
+
+// reserve output buffer
+if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
+LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
+return -2;
+};
+
+// set output mappings
+if (batch_all.logits) {
+int32_t i_logits = 0;
+for (uint32_t i = 0; i < n_tokens_all; ++i) {
+if (batch_all.logits[i]) {
+lctx.output_ids[i] = i_logits++;
+}
+}
+} else {
+for (uint32_t i = 0; i < n_outputs; ++i) {
+lctx.output_ids[i] = i;
+}
+}
+
 for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
 const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
 llama_batch u_batch = {
@@ -8837,6 +10191,27 @@ static int llama_decode_internal(
 /* .all_seq_id = */ batch_all.all_seq_id,
 };
 
+// count the outputs in this u_batch
+{
+int32_t n_outputs_new = 0;
+
+if (u_batch.logits) {
+for (uint32_t i = 0; i < n_tokens; i++) {
+n_outputs_new += u_batch.logits[i] != 0;
+}
+} else if (n_outputs == n_tokens_all) {
+n_outputs_new = n_tokens;
+} else {
+// keep last output only
+if (cur_token + n_tokens >= n_tokens_all) {
+n_outputs_new = 1;
+}
+}
+
+// needs to happen before the graph is built
+lctx.n_outputs = n_outputs_new;
+}
+
 int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
 GGML_ASSERT(n_threads > 0);
 
@@ -8900,23 +10275,37 @@ static int llama_decode_internal(
 struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
 struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
 
-if (
+if (lctx.n_outputs == 0) {
+// no output
+res = nullptr;
+embd = nullptr;
+} else if (!hparams.causal_attn) {
 res = nullptr; // do not extract logits for embedding models such as BERT
 
 // token or sequence embeddings
 embd = gf->nodes[gf->n_nodes - 1];
 
 GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
-} else {
-
-
-
-
-
-
-}
-
+} else if (cparams.embeddings) {
+// the embeddings could be in the second to last tensor, or any of the previous tensors
+int i_embd = gf->n_nodes - 2;
+for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
+i_embd = gf->n_nodes - i;
+if (i_embd < 0) { break; }
+embd = gf->nodes[i_embd];
+}
+GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
+
+// TODO: use a per-batch flag to know when to skip logits while keeping embeddings
+if (!cparams.causal_attn) {
+res = nullptr; // do not extract logits when not needed
+// skip computing logits
+// TODO: is this safe?
+gf->n_nodes = i_embd + 1;
 }
+} else {
+embd = nullptr; // do not extract embeddings when not needed
+GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
 }
 // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
@@ -8959,67 +10348,38 @@ static int llama_decode_internal(
 //}
 
 // extract logits
-// TODO: do not compute and extract logits if only embeddings are needed
-// update the graphs to skip "result_output" if logits are not needed
 if (res) {
 ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
 GGML_ASSERT(backend_res != nullptr);
-
-
-
-
-
-
-
-
-
-// extract logits for the range [i_first, i_last)
-// group the requests to minimize the number of calls to the backend
-ggml_backend_tensor_get_async(backend_res, res,
-logits_out + n_vocab*(cur_token + i_first),
-i_first*n_vocab*sizeof(float),
-(i_last - i_first)*n_vocab*sizeof(float));
-i_first = -1;
-}
-}
-#ifndef NDEBUG
-logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
-#endif
-}
-} else if (lctx.logits_all) {
-ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
-#ifndef NDEBUG
-std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
-#endif
-} else {
-if (cur_token + n_tokens >= n_tokens_all) {
-ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
-#ifndef NDEBUG
-logits_valid[0] = true;
-#endif
-}
+GGML_ASSERT(lctx.logits != nullptr);
+
+float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
+const int32_t n_outputs_new = lctx.n_outputs;
+
+if (n_outputs_new) {
+GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
+GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
+ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
 }
 }
 
 // extract embeddings
-if (
+if (embd) {
 ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
 GGML_ASSERT(backend_embd != nullptr);
 
 switch (cparams.pooling_type) {
 case LLAMA_POOLING_TYPE_NONE:
-{
-// extract token embeddings
-
-
-
-
-
-
-
-
-ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
-}
+{
+// extract token embeddings
+GGML_ASSERT(lctx.embd != nullptr);
+float * embd_out = lctx.embd + n_outputs_prev*n_embd;
+const int32_t n_outputs_new = lctx.n_outputs;
+
+if (n_outputs_new) {
+GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
+GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
+ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
 }
 } break;
 case LLAMA_POOLING_TYPE_CLS:
@@ -9046,6 +10406,7 @@ static int llama_decode_internal(
 } break;
 }
 }
+n_outputs_prev += lctx.n_outputs;
 }
 
 // wait for the computation to finish (automatically done when obtaining the model output)
@@ -9976,7 +11337,7 @@ struct llm_tokenizer_wpm {
 if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
 continue;
 }
-code =
+code = unicode_tolower(code);
 if (type == CODEPOINT_TYPE_WHITESPACE) {
 code = ' ';
 }
@@ -9996,7 +11357,7 @@ struct llm_tokenizer_wpm {
 std::vector<std::string> words;
 while (r < new_str.size()) {
 // if is whitespace
-if (isspace(new_str[r])) {
+if (isspace(new_str[r], std::locale::classic())) {
 if (r > l) words.push_back(new_str.substr(l, (r - l)));
 l = r + 1;
 r = l;
@@ -10010,18 +11371,12 @@ struct llm_tokenizer_wpm {
 return words;
 }
 
-uint32_t to_lower(uint32_t code) {
-static const std::locale locale("en_US.UTF-8");
-#if defined(_WIN32)
-if (code > 0xFFFF) {
-return code;
-}
-#endif
-return std::tolower(wchar_t(code), locale);
-}
-
 bool is_ascii_punct(uint32_t code) {
-
+if (code > 0xFF) {
+return false;
+}
+auto c = char(static_cast<unsigned char>(code));
+return ispunct(c, std::locale::classic());
 }
 
 bool is_chinese_char(uint32_t cpt) {
@@ -10266,28 +11621,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 // grammar - internal
 //
 
-struct llama_partial_utf8 {
-uint32_t value; // bit value so far (unshifted)
-int n_remain; // num bytes remaining; -1 indicates invalid sequence
-};
-
-struct llama_grammar {
-const std::vector<std::vector<llama_grammar_element>> rules;
-std::vector<std::vector<const llama_grammar_element *>> stacks;
-
-// buffer for partially generated UTF-8 sequence from accepted tokens
-llama_partial_utf8 partial_utf8;
-};
-
-struct llama_grammar_candidate {
-size_t index;
-const uint32_t * code_points;
-llama_partial_utf8 partial_utf8;
-};
 
 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
 const std::string & src,
 llama_partial_utf8 partial_start) {
 static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -10489,7 +11826,7 @@ static void llama_grammar_advance_stack(
 // be positioned at a character range (see `llama_grammar_advance_stack`), and
 // produces the N possible stacks if the given char is accepted at those
 // positions
-
+std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
 const std::vector<std::vector<llama_grammar_element>> & rules,
 const std::vector<std::vector<const llama_grammar_element *>> & stacks,
 const uint32_t chr) {
@@ -11715,7 +13052,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
 // for getting the current layer as I initially thought, and we need to resort to parsing the
 // tensor name.
-n_layer /= n_expert;
 if (sscanf(name, "blk.%d.", &i_layer) != 1) {
 throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
 }
@@ -11729,30 +13065,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
 // with the quantization of the output tensor
 if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
-
-
-
-
-
-
-
-
-
-
+if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
+new_type = qs.params->output_tensor_type;
+} else {
+int nx = tensor->ne[0];
+if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+new_type = GGML_TYPE_Q8_0;
+}
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
+ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+new_type = GGML_TYPE_Q5_K;
+}
+else if (new_type != GGML_TYPE_Q8_0) {
+new_type = GGML_TYPE_Q6_K;
+}
 }
 } else if (name == "token_embd.weight") {
-if (
-
-
-
-
-
-
-
+if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
+new_type = qs.params->token_embedding_type;
+} else {
+if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
+ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+new_type = GGML_TYPE_Q2_K;
+}
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+new_type = GGML_TYPE_IQ3_S;
+}
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+new_type = GGML_TYPE_IQ3_S;
+}
 }
 } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
 if (name.find("attn_v.weight") != std::string::npos) {
 if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
 else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
@@ -11771,7 +13116,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 if (qs.model.hparams.n_expert == 8) {
 new_type = GGML_TYPE_Q5_K;
 } else {
-if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
 }
 }
@@ -11785,13 +13130,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
 new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
 }
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
-new_type = GGML_TYPE_Q4_K;
-}
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-new_type = GGML_TYPE_Q4_K;
-}
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
+else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
 new_type = GGML_TYPE_Q4_K;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
@@ -11944,7 +13283,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
 new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
 new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-new_type == GGML_TYPE_IQ3_XXS ||
+new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
+new_type == GGML_TYPE_IQ1_M) {
 int nx = tensor->ne[0];
 int ny = tensor->ne[1];
 if (nx % QK_K != 0) {
@@ -11962,6 +13302,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 case GGML_TYPE_IQ3_XXS:
 case GGML_TYPE_IQ3_S:
 case GGML_TYPE_IQ1_S:
+case GGML_TYPE_IQ1_M:
 case GGML_TYPE_Q2_K:
 case GGML_TYPE_Q3_K:
 case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
@@ -12043,6 +13384,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
 case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
 case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
+case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
 case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
 case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
 case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
@@ -12065,8 +13407,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 constexpr bool use_mmap = false;
 #endif
 
-
-
+llama_model_kv_override * kv_overrides = nullptr;
+if (params->kv_overrides) {
+auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+kv_overrides = v->data();
+}
+llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
+ml.init_mappings(false); // no prefetching
 
 llama_model model;
 llm_load_arch(ml, model);
@@ -12090,36 +13437,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 struct gguf_context * ctx_out = gguf_init_empty();
 
 // copy the KV pairs from the input file
-gguf_set_kv (ctx_out, ml.
+gguf_set_kv (ctx_out, ml.meta);
 gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
 gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
+if (params->kv_overrides) {
+const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
+for (auto & o : overrides) {
+if (o.key[0] == 0) break;
+if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
+gguf_set_val_f32(ctx_out, o.key, o.float_value);
+} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
+gguf_set_val_i32(ctx_out, o.key, o.int_value);
+} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
+gguf_set_val_bool(ctx_out, o.key, o.bool_value);
+} else {
+LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
+}
+}
+}
+
 for (int i = 0; i < ml.n_tensors; ++i) {
-struct ggml_tensor * meta = ml.get_tensor_meta(i);
+const struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
 const std::string name = ggml_get_name(meta);
 
 // TODO: avoid hardcoded tensor names - use the TN_* constants
 if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
 ++qs.n_attention_wv;
-}
-else if (name.find("ffn_down") != std::string::npos) {
-++qs.n_ffn_down;
-}
-else if (name.find("ffn_gate") != std::string::npos) {
-++qs.n_ffn_gate;
-}
-else if (name.find("ffn_up") != std::string::npos) {
-++qs.n_ffn_up;
-}
-else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
 qs.has_output = true;
 }
 }
-
-
-
-
+
+qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
+
+// sanity checks
+GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
 
 size_t total_size_org = 0;
 size_t total_size_new = 0;
@@ -12135,7 +13489,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
 // populate the original tensors so we get an initial meta data
 for (int i = 0; i < ml.n_tensors; ++i) {
-struct ggml_tensor * meta = ml.get_tensor_meta(i);
+const struct ggml_tensor * meta = ml.get_tensor_meta(i);
 gguf_add_tensor(ctx_out, meta);
 }
 
@@ -12149,6 +13503,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // placeholder for the meta data
 ::zeros(fout, meta_size);
 
+const auto tn = LLM_TN(model.arch);
+
 for (int i = 0; i < ml.n_tensors; ++i) {
 struct ggml_tensor * tensor = ml.get_tensor_meta(i);
 
@@ -12171,8 +13527,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // This used to be a regex, but <regex> has an extreme cost to compile times.
 bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
-// quantize only 2D tensors
-quantize &= (ggml_n_dims(tensor)
+// quantize only 2D and 3D tensors (experts)
+quantize &= (ggml_n_dims(tensor) >= 2);
 quantize &= params->quantize_output_tensor || name != "output.weight";
 quantize &= !params->only_copy;
 
@@ -12201,6 +13557,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 if (!params->pure && ggml_is_quantized(default_type)) {
 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
 }
+else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
+new_type = params->token_embedding_type;
+}
+else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
+new_type = params->output_tensor_type;
+}
 
 // If we've decided to quantize to the same type the tensor is already
 // in then there's nothing to do.
@@ -12221,11 +13583,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 if (it == imatrix_data->end()) {
 LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
 } else {
-if (it->second.size() == (size_t)tensor->ne[0]) {
+if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
 imatrix = it->second.data();
 } else {
 LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
-int(it->second.size()), int(tensor->ne[0]), tensor->name);
+int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
+
+// this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
+// this is a significant error and it may be good idea to abort the process if this happens,
+// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
+// tok_embd should be ignored in this case, since it always causes this warning
+if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
+int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
+}
 }
 }
 }
@@ -12233,6 +13604,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 new_type == GGML_TYPE_IQ2_XS ||
 new_type == GGML_TYPE_IQ2_S ||
 new_type == GGML_TYPE_IQ1_S ||
+(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
 (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
 LLAMA_LOG_ERROR("\n\n============================================================\n");
 LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -12261,15 +13633,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 new_data = work.data();
 
 const int n_per_row = tensor->ne[0];
-const int nrows =
+const int nrows = tensor->ne[1];
 
 static const int min_chunk_size = 32 * 512;
 const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
 
-const int
+const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
+const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
 const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
-new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
 
+// quantize each expert separately since they have different importance matrices
+new_size = 0;
+for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
+const float * f32_data_03 = f32_data + i03 * nelements_matrix;
+void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
+const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
+
+new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+}
 LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
 }
 total_size_org += ggml_nbytes(tensor);
@@ -12340,7 +13721,7 @@ static int llama_apply_lora_from_file_internal(
 if (path_base_model) {
 LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
 ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
-ml->
+ml->init_mappings(/*prefetch*/ false); // no prefetching
 }
 
 struct tensor_meta {
@@ -12461,7 +13842,7 @@ static int llama_apply_lora_from_file_internal(
 
 ggml_tensor * base_t;
 if (ml) {
-if (
+if (!ml->get_tensor_meta(base_name.c_str())) {
 LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
 return 1;
 }
@@ -12645,11 +14026,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 struct llama_model_quantize_params result = {
 /*.nthread =*/ 0,
 /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+/*.output_tensor_type =*/ GGML_TYPE_COUNT,
+/*.token_embedding_type =*/ GGML_TYPE_COUNT,
 /*.allow_requantize =*/ false,
 /*.quantize_output_tensor =*/ true,
 /*.only_copy =*/ false,
 /*.pure =*/ false,
 /*.imatrix =*/ nullptr,
+/*.kv_overrides =*/ nullptr,
 };
 
 return result;
@@ -12658,7 +14042,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 size_t llama_max_devices(void) {
 #if defined(GGML_USE_METAL)
 return 1;
-#elif defined(
+#elif defined(GGML_USE_CUDA)
 return GGML_CUDA_MAX_DEVICES;
 #elif defined(GGML_USE_SYCL)
 return GGML_SYCL_MAX_DEVICES;
@@ -12678,8 +14062,8 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(
-defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 return true;
 #else
@@ -12786,7 +14170,7 @@ struct llama_context * llama_new_context_with_model(
 const auto & hparams = model->hparams;
 auto & cparams = ctx->cparams;
 
-
+cparams.n_seq_max = std::max(1u, params.n_seq_max);
 cparams.n_threads = params.n_threads;
 cparams.n_threads_batch = params.n_threads_batch;
 cparams.yarn_ext_factor = params.yarn_ext_factor;
@@ -12802,6 +14186,9 @@ struct llama_context * llama_new_context_with_model(
 cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
 cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
+// this is necessary due to kv_self.n being padded later during inference
+cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
+
 // with causal attention, the batch size is limited by the context size
 cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
 cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
@@ -12881,32 +14268,43 @@ struct llama_context * llama_new_context_with_model(
 }
 ctx->backends.push_back(ctx->backend_metal);
 }
-#elif defined(
-if (model->
+#elif defined(GGML_USE_CUDA)
+if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
 // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-
-
+ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
+if (backend == nullptr) {
+LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
+llama_free(ctx);
+return nullptr;
+}
+ctx->backends.push_back(backend);
+} else {
+// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
+ggml_backend_t backend = ggml_backend_cuda_init(device);
 if (backend == nullptr) {
-LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__,
+LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
 llama_free(ctx);
 return nullptr;
 }
 ctx->backends.push_back(backend);
-} else {
-// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
-ggml_backend_t backend = ggml_backend_cuda_init(device);
-if (backend == nullptr) {
-LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
-llama_free(ctx);
-return nullptr;
-}
-ctx->backends.push_back(backend);
-}
 }
 }
 #elif defined(GGML_USE_VULKAN)
-if (model->
+if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
+llama_free(ctx);
+return nullptr;
+}
+if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
+ggml_backend_t backend = ggml_backend_vk_init(0);
+if (backend == nullptr) {
+LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+llama_free(ctx);
+return nullptr;
+}
+ctx->backends.push_back(backend);
+} else {
 for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
 ggml_backend_t backend = ggml_backend_vk_init(device);
 if (backend == nullptr) {
@@ -12918,31 +14316,28 @@ struct llama_context * llama_new_context_with_model(
 }
 }
 #elif defined(GGML_USE_SYCL)
-
-
-
-
-
+// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+if (backend == nullptr) {
+int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
+LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
+llama_free(ctx);
+return nullptr;
+}
+ctx->backends.push_back(backend);
+} else {
+// LLAMA_SPLIT_LAYER requires a backend for each GPU
+for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+ggml_backend_t backend = ggml_backend_sycl_init(i);
 if (backend == nullptr) {
-
+int id_list[GGML_SYCL_MAX_DEVICES];
+ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
 llama_free(ctx);
 return nullptr;
 }
 ctx->backends.push_back(backend);
-} else {
-// LLAMA_SPLIT_LAYER requires a backend for each GPU
-int id_list[GGML_SYCL_MAX_DEVICES];
-ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
-for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
-int device_id = id_list[i];
-ggml_backend_t backend = ggml_backend_sycl_init(i);
-if (backend == nullptr) {
-LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
-llama_free(ctx);
-return nullptr;
-}
-ctx->backends.push_back(backend);
-}
 }
 }
 #elif defined(GGML_USE_KOMPUTE)
@@ -12990,25 +14385,12 @@ struct llama_context * llama_new_context_with_model(
 
 // graph outputs buffer
 {
-// resized during inference
-ctx
-
-
-const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
-
-ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
-if (ctx->buf_output == nullptr) {
-LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
+// resized during inference when a batch uses more outputs
+if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
+LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
 llama_free(ctx);
 return nullptr;
 }
-ggml_backend_buffer_clear(ctx->buf_output, 0);
-
-
-ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
-if (params.embeddings) {
-ctx->embd = ctx->logits + ctx->logits_size;
-}
 
 LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
 ggml_backend_buffer_name(ctx->buf_output),
@@ -13033,7 +14415,7 @@ struct llama_context * llama_new_context_with_model(
 
 // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
 bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
-#ifndef
+#ifndef GGML_USE_CUDA
 // pipeline parallelism requires support for async compute and events
 // currently this is only implemented in the CUDA backend
 pipeline_parallel = false;
@@ -13061,14 +14443,17 @@ struct llama_context * llama_new_context_with_model(
 ggml_backend_t backend = ctx->backends[i];
 ggml_backend_buffer_type_t buft = backend_buft[i];
 size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
-
-
-
+if (size > 1) {
+LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+ggml_backend_buft_name(buft),
+size / 1024.0 / 1024.0);
+}
 }
 
 // note: the number of splits during measure is higher than during inference due to the kv shift
 int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-LLAMA_LOG_INFO("%s: graph
+LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
+LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
 }
 }
 
@@ -13138,10 +14523,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 case LLM_ARCH_ORION:
 case LLM_ARCH_INTERNLM2:
 case LLM_ARCH_MINICPM:
+case LLM_ARCH_XVERSE:
+case LLM_ARCH_COMMAND_R:
 return LLAMA_ROPE_TYPE_NORM;
 
 // the pairs of head values are offset by n_rot/2
 case LLM_ARCH_FALCON:
+case LLM_ARCH_GROK:
 case LLM_ARCH_PERSIMMON:
 case LLM_ARCH_BERT:
 case LLM_ARCH_NOMIC_BERT:
@@ -13174,6 +14562,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
 return model->hparams.n_embd;
 }
 
+int32_t llama_n_layer(const struct llama_model * model) {
+return model->hparams.n_layer;
+}
+
 float llama_rope_freq_scale_train(const struct llama_model * model) {
 return model->hparams.rope_freq_scale_train;
 }
@@ -13273,6 +14665,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
 }
 }
 
+static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+GGML_ASSERT(cvec.tensors.empty());
+GGML_ASSERT(cvec.ctxs.empty());
+GGML_ASSERT(cvec.bufs.empty());
+
+// count layer buffer types
+std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+for (int64_t i = 0; i < model.hparams.n_layer; i++) {
+buft_layer_count[model.buft_layer[i].buft]++;
+}
+
+// allocate contexts
+std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+for (auto & it : buft_layer_count) {
+int n_layers = it.second;
+struct ggml_init_params params = {
+/*.mem_size =*/ n_layers * ggml_tensor_overhead(),
+/*.mem_buffer =*/ NULL,
+/*.no_alloc =*/ true,
+};
+ggml_context * ctx = ggml_init(params);
+if (!ctx) {
+LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+return 1;
+}
+ctx_map[it.first] = ctx;
+}
+
+// make tensors
+cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+for (size_t il = 1; il < model.hparams.n_layer; il++) {
+struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
+ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+cvec.tensors.push_back(tensor);
+}
+
+// allocate tensors / buffers and zero
+for (auto it : ctx_map) {
+ggml_backend_buffer_type_t buft = it.first;
+ggml_context * ctx = it.second;
+ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+if (!buf) {
+LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+return false;
+}
+ggml_backend_buffer_clear(buf, 0);
+cvec.ctxs.push_back(ctx);
+cvec.bufs.push_back(buf);
+}
+
+return true;
+}
+
+int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
+const llama_model & model = lctx->model;
+llama_control_vector & cvec = lctx->cvec;
+
+if (data == nullptr) {
+// disable the current control vector (but leave allocated for later)
+cvec.layer_start = -1;
+cvec.layer_end = -1;
+return 0;
+}
+
+if (n_embd != (int) model.hparams.n_embd) {
+LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+return 1;
+}
+
+if (cvec.tensors.empty()) {
+if (!llama_control_vector_init(cvec, model)) {
+return 1;
+}
+}
+
+cvec.layer_start = il_start;
+cvec.layer_end = il_end;
+
+for (size_t il = 1; il < model.hparams.n_layer; il++) {
+assert(cvec.tensors[il] != nullptr);
+
+const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+if (off + n_embd <= len) {
+ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+}
+}
+
+return 0;
+}
+
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
 struct llama_kv_cache_view result = {
 /*.n_cells = */ 0,
@@ -13426,27 +14908,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
|
|
13426
14908
|
|
13427
14909
|
// Returns the *maximum* size of the state
|
13428
14910
|
size_t llama_get_state_size(const struct llama_context * ctx) {
|
14911
|
+
const auto & cparams = ctx->cparams;
|
14912
|
+
const auto & hparams = ctx->model.hparams;
|
14913
|
+
|
13429
14914
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
13430
14915
|
// for reference, std::mt19937(1337) serializes to 6701 bytes.
|
13431
14916
|
const size_t s_rng_size = sizeof(size_t);
|
13432
14917
|
const size_t s_rng = LLAMA_MAX_RNG_STATE;
|
14918
|
+
const size_t s_n_outputs = sizeof(size_t);
|
14919
|
+
// assume worst case for outputs although only currently set ones are serialized
|
14920
|
+
const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t);
|
13433
14921
|
const size_t s_logits_size = sizeof(size_t);
|
13434
|
-
|
13435
|
-
const size_t s_logits = ctx->logits_size * sizeof(float);
|
14922
|
+
const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
|
13436
14923
|
const size_t s_embedding_size = sizeof(size_t);
|
13437
|
-
const size_t s_embedding = ctx->embd_size * sizeof(float);
|
14924
|
+
const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0;
|
13438
14925
|
const size_t s_kv_buf_size = sizeof(size_t);
|
13439
14926
|
const size_t s_kv_head = sizeof(uint32_t);
|
13440
14927
|
const size_t s_kv_size = sizeof(uint32_t);
|
13441
14928
|
const size_t s_kv_used = sizeof(uint32_t);
|
13442
14929
|
const size_t s_kv = ctx->kv_self.total_size();
|
13443
|
-
|
13444
|
-
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
|
14930
|
+
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
|
13445
14931
|
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
|
13446
14932
|
|
13447
14933
|
const size_t s_total = (
|
13448
14934
|
+ s_rng_size
|
13449
14935
|
+ s_rng
|
14936
|
+
+ s_n_outputs
|
14937
|
+
+ s_output_pos
|
13450
14938
|
+ s_logits_size
|
13451
14939
|
+ s_logits
|
13452
14940
|
+ s_embedding_size
|
@@ -13521,7 +15009,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13521
15009
|
std::ostringstream rng_ss;
|
13522
15010
|
rng_ss << ctx->rng;
|
13523
15011
|
|
13524
|
-
const std::string & rng_str
|
15012
|
+
const std::string & rng_str = rng_ss.str();
|
13525
15013
|
const size_t rng_size = rng_str.size();
|
13526
15014
|
|
13527
15015
|
GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
|
@@ -13530,25 +15018,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13530
15018
|
data_ctx->write(rng_str.data(), rng_size);
|
13531
15019
|
}
|
13532
15020
|
|
13533
|
-
// copy
|
15021
|
+
// copy outputs
|
13534
15022
|
{
|
13535
|
-
|
15023
|
+
// Can't use ctx->n_outputs because it's not for the
|
15024
|
+
// entire last batch when n_ubatch is smaller than n_batch
|
15025
|
+
size_t n_outputs = 0;
|
13536
15026
|
|
13537
|
-
|
15027
|
+
// copy output ids
|
15028
|
+
{
|
15029
|
+
std::vector<int32_t> output_pos;
|
13538
15030
|
|
13539
|
-
|
13540
|
-
|
15031
|
+
const size_t n_batch = ctx->cparams.n_batch;
|
15032
|
+
const auto & output_ids = ctx->output_ids;
|
15033
|
+
|
15034
|
+
output_pos.resize(ctx->output_size);
|
15035
|
+
|
15036
|
+
// build a more compact representation of the output ids
|
15037
|
+
for (size_t i = 0; i < n_batch; ++i) {
|
15038
|
+
// map an output id to a position in the batch
|
15039
|
+
int32_t pos = output_ids[i];
|
15040
|
+
if (pos >= 0) {
|
15041
|
+
if ((size_t) pos >= n_outputs) {
|
15042
|
+
n_outputs = pos + 1;
|
15043
|
+
}
|
15044
|
+
GGML_ASSERT((size_t) pos < ctx->output_size);
|
15045
|
+
output_pos[pos] = i;
|
15046
|
+
}
|
15047
|
+
}
|
15048
|
+
|
15049
|
+
data_ctx->write(&n_outputs, sizeof(n_outputs));
|
15050
|
+
|
15051
|
+
if (n_outputs) {
|
15052
|
+
data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
|
15053
|
+
}
|
13541
15054
|
}
|
13542
|
-
}
|
13543
15055
|
|
13544
|
-
|
13545
|
-
|
13546
|
-
|
15056
|
+
// copy logits
|
15057
|
+
{
|
15058
|
+
const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);
|
13547
15059
|
|
13548
|
-
|
15060
|
+
data_ctx->write(&logits_size, sizeof(logits_size));
|
13549
15061
|
|
13550
|
-
|
13551
|
-
|
15062
|
+
if (logits_size) {
|
15063
|
+
data_ctx->write(ctx->logits, logits_size * sizeof(float));
|
15064
|
+
}
|
15065
|
+
}
|
15066
|
+
|
15067
|
+
// copy embeddings
|
15068
|
+
{
|
15069
|
+
const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
|
15070
|
+
|
15071
|
+
data_ctx->write(&embeddings_size, sizeof(embeddings_size));
|
15072
|
+
|
15073
|
+
if (embeddings_size) {
|
15074
|
+
data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
|
15075
|
+
}
|
13552
15076
|
}
|
13553
15077
|
}
|
13554
15078
|
|
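The block above compacts the output bookkeeping before writing it: ctx->output_ids maps each batch position to an output row (or -1 when that token produced no output), and the loop builds the inverse map output_pos so only n_outputs entries need to be serialized. A standalone sketch of that compaction with illustrative values:

    // Standalone sketch of the compaction: output_ids maps a batch position to
    // an output row (or -1); output_pos is the inverse map, so only n_outputs
    // entries are serialized. Data here is illustrative.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
        // batch of 6 tokens, outputs requested for positions 1, 4 and 5
        const std::vector<int32_t> output_ids = { -1, 0, -1, -1, 1, 2 };

        size_t n_outputs = 0;
        std::vector<int32_t> output_pos(output_ids.size(), -1);
        for (size_t i = 0; i < output_ids.size(); ++i) {
            const int32_t pos = output_ids[i];
            if (pos >= 0) {
                if ((size_t) pos >= n_outputs) {
                    n_outputs = pos + 1;
                }
                output_pos[pos] = (int32_t) i;   // output row -> batch position
            }
        }

        assert(n_outputs == 3);
        assert(output_pos[0] == 1 && output_pos[1] == 4 && output_pos[2] == 5);
        return 0;
    }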
@@ -13561,9 +15085,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13561
15085
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
13562
15086
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
13563
15087
|
|
13564
|
-
|
15088
|
+
// NOTE: kv_size and kv_buf_size are mostly used for sanity checks
|
13565
15089
|
const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
|
13566
15090
|
const uint32_t kv_size = kv_self.size;
|
15091
|
+
const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
|
13567
15092
|
const uint32_t kv_used = kv_self.used;
|
13568
15093
|
|
13569
15094
|
data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
|
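kv_buf_size is estimated as bytes-per-cell (total cache bytes divided by the number of cells) times kv_head, the highest occupied cell index plus one, so only the used portion of the cache is counted. The same arithmetic with placeholder numbers:

    // Same arithmetic with placeholder numbers: bytes per cell times cells in use.
    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t total_size = 64u * 1024 * 1024; // assumed KV buffer bytes
        const size_t kv_size    = 1024;              // cache capacity in cells
        const size_t kv_head    = 300;               // highest occupied cell + 1

        const size_t kv_buf_size = total_size / (kv_size ? kv_size : 1) * kv_head;
        printf("serialized KV payload: %zu bytes\n", kv_buf_size);
        return 0;
    }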
@@ -13572,6 +15097,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13572
15097
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
13573
15098
|
|
13574
15099
|
if (kv_buf_size) {
|
15100
|
+
const size_t pre_kv_buf_size = data_ctx->get_size_written();
|
15101
|
+
|
13575
15102
|
std::vector<uint8_t> tmp_buf;
|
13576
15103
|
for (int il = 0; il < (int) n_layer; ++il) {
|
13577
15104
|
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
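The new pre_kv_buf_size bookkeeping samples the writer's byte count before the KV payload so the code can later assert that exactly kv_buf_size bytes were emitted. A simplified stand-in for that pattern, using a plain byte-counting writer rather than the library's data-context object:

    // Simplified stand-in for the pattern: sample the writer's byte count before
    // a payload, then assert exactly the expected number of bytes went out.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct byte_writer {                      // hypothetical, not the library's type
        std::vector<uint8_t> buf;
        void write(const void * src, size_t size) {
            const uint8_t * p = static_cast<const uint8_t *>(src);
            buf.insert(buf.end(), p, p + size);
        }
        size_t size_written() const { return buf.size(); }
    };

    int main() {
        byte_writer w;
        const uint64_t payload[2] = { 1, 2 };        // 16 bytes
        const size_t expected     = sizeof(payload);

        const size_t before = w.size_written();
        w.write(payload, sizeof(payload));
        assert(w.size_written() - before == expected);
        return 0;
    }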
@@ -13601,6 +15128,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13601
15128
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
13602
15129
|
}
|
13603
15130
|
}
|
15131
|
+
GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
|
13604
15132
|
}
|
13605
15133
|
|
13606
15134
|
for (uint32_t i = 0; i < kv_head; ++i) {
|
@@ -13645,6 +15173,28 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
13645
15173
|
GGML_ASSERT(!rng_ss.fail());
|
13646
15174
|
}
|
13647
15175
|
|
15176
|
+
// set output ids
|
15177
|
+
{
|
15178
|
+
size_t n_outputs;
|
15179
|
+
std::vector<int32_t> output_pos;
|
15180
|
+
|
15181
|
+
memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
|
15182
|
+
|
15183
|
+
GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
|
15184
|
+
|
15185
|
+
if (n_outputs) {
|
15186
|
+
output_pos.resize(n_outputs);
|
15187
|
+
memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
|
15188
|
+
inp += n_outputs * sizeof(int32_t);
|
15189
|
+
|
15190
|
+
for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
|
15191
|
+
int32_t id = output_pos[i];
|
15192
|
+
GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
|
15193
|
+
ctx->output_ids[id] = i;
|
15194
|
+
}
|
15195
|
+
}
|
15196
|
+
}
|
15197
|
+
|
13648
15198
|
// set logits
|
13649
15199
|
{
|
13650
15200
|
size_t logits_size;
|
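The "set output ids" block above reverses the encoding written during save: each entry of the serialized output_pos list (output row -> batch position) is scattered back into ctx->output_ids (batch position -> output row), after reserving enough output rows. A standalone sketch of that inversion with the same illustrative values as before:

    // Standalone sketch of the inversion: scatter the saved output_pos list
    // (output row -> batch position) back into output_ids (batch position -> row).
    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
        const size_t n_batch = 6;
        const std::vector<int32_t> output_pos = { 1, 4, 5 };  // as saved above
        std::vector<int32_t> output_ids(n_batch, -1);

        for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
            const int32_t id = output_pos[i];
            assert((size_t) id < n_batch);                     // mirrors the n_batch check
            output_ids[id] = i;                                // batch position -> output row
        }

        assert(output_ids[1] == 0 && output_ids[4] == 1 && output_ids[5] == 2);
        return 0;
    }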
@@ -13665,7 +15215,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
13665
15215
|
|
13666
15216
|
memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
|
13667
15217
|
|
13668
|
-
GGML_ASSERT(ctx->embd_size
|
15218
|
+
GGML_ASSERT(ctx->embd_size >= embeddings_size);
|
13669
15219
|
|
13670
15220
|
if (embeddings_size) {
|
13671
15221
|
memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
|
@@ -13692,8 +15242,18 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
13692
15242
|
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
13693
15243
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
13694
15244
|
|
15245
|
+
if (kv_self.size != kv_size) {
|
15246
|
+
// the KV cache needs to be big enough to load all the KV cells from the saved state
|
15247
|
+
GGML_ASSERT(kv_self.size >= kv_head);
|
15248
|
+
|
15249
|
+
LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
|
15250
|
+
__func__, kv_head, kv_size, kv_self.size);
|
15251
|
+
}
|
15252
|
+
|
13695
15253
|
if (kv_buf_size) {
|
13696
|
-
|
15254
|
+
const size_t pre_kv_buf_size = inp - src;
|
15255
|
+
|
15256
|
+
GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
|
13697
15257
|
|
13698
15258
|
for (int il = 0; il < (int) n_layer; ++il) {
|
13699
15259
|
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
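Loading is now tolerant of a state saved with a different kv_size: the only hard requirement is that the current cache has at least kv_head cells to hold the restored cells, and a size mismatch is merely logged. A small standalone sketch of that compatibility rule (the helper name is hypothetical):

    // Hypothetical standalone check expressing the rule applied above.
    #include <cstdint>
    #include <cstdio>

    static bool kv_state_compatible(uint32_t current_size, uint32_t saved_size, uint32_t saved_head) {
        if (current_size < saved_head) {
            return false;                    // not enough cells to restore the state
        }
        if (current_size != saved_size) {
            printf("note: saved kv_size=%u, current kv_size=%u (fine, but different)\n",
                   saved_size, current_size);
        }
        return true;
    }

    int main() {
        printf("%d\n", kv_state_compatible(2048, 1024, 300)); // 1: compatible
        printf("%d\n", kv_state_compatible( 256, 1024, 300)); // 0: cache too small
        return 0;
    }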
@@ -13713,23 +15273,21 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
13713
15273
|
|
13714
15274
|
// v is not contiguous, copy row by row
|
13715
15275
|
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
13716
|
-
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type,
|
15276
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);
|
13717
15277
|
|
13718
15278
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
13719
15279
|
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
|
13720
15280
|
inp += v_row_size;
|
13721
15281
|
}
|
13722
15282
|
}
|
15283
|
+
GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
|
13723
15284
|
}
|
13724
15285
|
|
13725
|
-
|
15286
|
+
llama_kv_cache_clear(ctx);
|
13726
15287
|
|
13727
15288
|
ctx->kv_self.head = kv_head;
|
13728
|
-
ctx->kv_self.size = kv_size;
|
13729
15289
|
ctx->kv_self.used = kv_used;
|
13730
15290
|
|
13731
|
-
ctx->kv_self.cells.resize(kv_size);
|
13732
|
-
|
13733
15291
|
for (uint32_t i = 0; i < kv_head; ++i) {
|
13734
15292
|
llama_pos pos;
|
13735
15293
|
size_t seq_id_size;
|
@@ -13746,11 +15304,6 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
13746
15304
|
ctx->kv_self.cells[i].seq_id.insert(seq_id);
|
13747
15305
|
}
|
13748
15306
|
}
|
13749
|
-
|
13750
|
-
for (uint32_t i = kv_head; i < kv_size; ++i) {
|
13751
|
-
ctx->kv_self.cells[i].pos = -1;
|
13752
|
-
ctx->kv_self.cells[i].seq_id.clear();
|
13753
|
-
}
|
13754
15307
|
}
|
13755
15308
|
|
13756
15309
|
const size_t nread = inp - src;
|
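Because the V cache is stored transposed, each of the n_embd_v_gqa rows has a stride of kv_self.size elements while only the first kv_head entries are restored, so the payload is copied row by row rather than with one flat copy. A standalone sketch of that strided copy using plain float buffers in place of backend tensors:

    // Standalone sketch of the row-by-row restore with plain float buffers:
    // each row has a stride of kv_size elements but only kv_head are copied.
    #include <cassert>
    #include <cstring>
    #include <vector>

    int main() {
        const size_t n_embd_v_gqa = 4;  // rows (one per value dimension), illustrative
        const size_t kv_size      = 8;  // row stride in elements
        const size_t kv_head      = 3;  // elements restored per row

        std::vector<float> src(n_embd_v_gqa * kv_head, 1.0f);  // packed payload
        std::vector<float> dst(n_embd_v_gqa * kv_size, 0.0f);  // strided "cache"

        const float * inp = src.data();
        for (size_t ir = 0; ir < n_embd_v_gqa; ++ir) {
            std::memcpy(dst.data() + ir*kv_size, inp, kv_head*sizeof(float));
            inp += kv_head;
        }

        assert(dst[0*kv_size + 2] == 1.0f);  // within kv_head: restored
        assert(dst[0*kv_size + 3] == 0.0f);  // beyond kv_head: untouched
        return 0;
    }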
@@ -13956,11 +15509,33 @@ float * llama_get_logits(struct llama_context * ctx) {
|
|
13956
15509
|
}
|
13957
15510
|
|
13958
15511
|
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
13959
|
-
assert(ctx->logits_valid.at(i));
|
13960
|
-
|
13961
15512
|
llama_synchronize(ctx);
|
13962
15513
|
|
13963
|
-
|
15514
|
+
try {
|
15515
|
+
if (ctx->logits == nullptr) {
|
15516
|
+
throw std::runtime_error("no logits");
|
15517
|
+
}
|
15518
|
+
if ((size_t) i >= ctx->output_ids.size()) {
|
15519
|
+
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
|
15520
|
+
}
|
15521
|
+
const int32_t j = ctx->output_ids[i];
|
15522
|
+
|
15523
|
+
if (j < 0) {
|
15524
|
+
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
15525
|
+
}
|
15526
|
+
if ((size_t) j >= ctx->output_size) {
|
15527
|
+
// This should not happen
|
15528
|
+
throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
|
15529
|
+
}
|
15530
|
+
|
15531
|
+
return ctx->logits + j*ctx->model.hparams.n_vocab;
|
15532
|
+
} catch (const std::exception & err) {
|
15533
|
+
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
|
15534
|
+
#ifndef NDEBUG
|
15535
|
+
GGML_ASSERT(false);
|
15536
|
+
#endif
|
15537
|
+
return nullptr;
|
15538
|
+
}
|
13964
15539
|
}
|
13965
15540
|
|
13966
15541
|
float * llama_get_embeddings(struct llama_context * ctx) {
|
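llama_get_logits_ith (and llama_get_embeddings_ith in the next hunk) now validates the index against ctx->output_ids and, in release builds, logs and returns nullptr instead of asserting, so callers should check the returned pointer. A usage sketch, assuming a llama_context * ctx that has already decoded a batch:

    // Usage sketch assuming a context that has already decoded a batch.
    #include "llama.h"
    #include <cstdio>

    static void print_first_logit(struct llama_context * ctx, int32_t i) {
        const float * logits = llama_get_logits_ith(ctx, i);
        if (logits == nullptr) {
            // invalid index or no logits requested for this position
            fprintf(stderr, "no logits available for batch position %d\n", i);
            return;
        }
        printf("logit[0] at position %d: %f\n", i, logits[0]);
    }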
@@ -13972,7 +15547,31 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
|
13972
15547
|
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
13973
15548
|
llama_synchronize(ctx);
|
13974
15549
|
|
13975
|
-
|
15550
|
+
try {
|
15551
|
+
if (ctx->embd == nullptr) {
|
15552
|
+
throw std::runtime_error("no embeddings");
|
15553
|
+
}
|
15554
|
+
if ((size_t) i >= ctx->output_ids.size()) {
|
15555
|
+
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
|
15556
|
+
}
|
15557
|
+
const int32_t j = ctx->output_ids[i];
|
15558
|
+
|
15559
|
+
if (j < 0) {
|
15560
|
+
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
15561
|
+
}
|
15562
|
+
if ((size_t) j >= ctx->output_size) {
|
15563
|
+
// This should not happen
|
15564
|
+
throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
|
15565
|
+
}
|
15566
|
+
|
15567
|
+
return ctx->embd + j*ctx->model.hparams.n_embd;
|
15568
|
+
} catch (const std::exception & err) {
|
15569
|
+
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
|
15570
|
+
#ifndef NDEBUG
|
15571
|
+
GGML_ASSERT(false);
|
15572
|
+
#endif
|
15573
|
+
return nullptr;
|
15574
|
+
}
|
13976
15575
|
}
|
13977
15576
|
|
13978
15577
|
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
|
@@ -14262,6 +15861,55 @@ static int32_t llama_chat_apply_template_internal(
|
|
14262
15861
|
ss << message->content << "</s>";
|
14263
15862
|
}
|
14264
15863
|
}
|
15864
|
+
} else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
|
15865
|
+
// openchat/openchat-3.5-0106,
|
15866
|
+
for (auto message : chat) {
|
15867
|
+
std::string role(message->role);
|
15868
|
+
if (role == "system") {
|
15869
|
+
ss << message->content << "<|end_of_turn|>";
|
15870
|
+
} else {
|
15871
|
+
role[0] = toupper(role[0]);
|
15872
|
+
ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
|
15873
|
+
}
|
15874
|
+
}
|
15875
|
+
if (add_ass) {
|
15876
|
+
ss << "GPT4 Correct Assistant:";
|
15877
|
+
}
|
15878
|
+
} else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
|
15879
|
+
// eachadea/vicuna-13b-1.1 (and Orca variant)
|
15880
|
+
for (auto message : chat) {
|
15881
|
+
std::string role(message->role);
|
15882
|
+
if (role == "system") {
|
15883
|
+
// Orca-Vicuna variant uses a system prefix
|
15884
|
+
if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
|
15885
|
+
ss << "SYSTEM: " << message->content << "\n";
|
15886
|
+
} else {
|
15887
|
+
ss << message->content << "\n\n";
|
15888
|
+
}
|
15889
|
+
} else if (role == "user") {
|
15890
|
+
ss << "USER: " << message->content << "\n";
|
15891
|
+
} else if (role == "assistant") {
|
15892
|
+
ss << "ASSISTANT: " << message->content << "</s>\n";
|
15893
|
+
}
|
15894
|
+
}
|
15895
|
+
if (add_ass) {
|
15896
|
+
ss << "ASSISTANT:";
|
15897
|
+
}
|
15898
|
+
} else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
|
15899
|
+
// deepseek-ai/deepseek-coder-33b-instruct
|
15900
|
+
for (auto message : chat) {
|
15901
|
+
std::string role(message->role);
|
15902
|
+
if (role == "system") {
|
15903
|
+
ss << message->content;
|
15904
|
+
} else if (role == "user") {
|
15905
|
+
ss << "### Instruction:\n" << message->content << "\n";
|
15906
|
+
} else if (role == "assistant") {
|
15907
|
+
ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
|
15908
|
+
}
|
15909
|
+
}
|
15910
|
+
if (add_ass) {
|
15911
|
+
ss << "### Response:\n";
|
15912
|
+
}
|
14265
15913
|
} else {
|
14266
15914
|
// template not supported
|
14267
15915
|
return -1;
|
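The template matcher now also recognizes the openchat, vicuna/vicuna-orca and deepseek formats. A usage sketch of the public API with the curated "vicuna" name; the model pointer is left null here so the named template is used directly rather than one embedded in a model:

    // Usage sketch: formatting a short conversation with the curated "vicuna" template.
    #include "llama.h"
    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        const std::vector<llama_chat_message> chat = {
            { "system",    "You are a helpful assistant." },
            { "user",      "Hello!"                       },
            { "assistant", "Hi, how can I help?"          },
            { "user",      "Tell me a joke."              },
        };

        std::vector<char> buf(4096);
        const int32_t n = llama_chat_apply_template(
            /*model   =*/ nullptr,
            /*tmpl    =*/ "vicuna",
            chat.data(), chat.size(),
            /*add_ass =*/ true,
            buf.data(), (int32_t) buf.size());

        if (n < 0) {
            fprintf(stderr, "template not supported\n");
            return 1;
        }
        const std::string prompt(buf.data(), std::min((size_t) n, buf.size()));
        printf("%s\n", prompt.c_str());
        return 0;
    }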
@@ -14311,6 +15959,30 @@ LLAMA_API int32_t llama_chat_apply_template(
|
|
14311
15959
|
return res;
|
14312
15960
|
}
|
14313
15961
|
|
15962
|
+
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
|
15963
|
+
static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
|
15964
|
+
if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
|
15965
|
+
return strlen(split_path);
|
15966
|
+
}
|
15967
|
+
return 0;
|
15968
|
+
}
|
15969
|
+
|
15970
|
+
int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
|
15971
|
+
std::string str_split_path(split_path);
|
15972
|
+
char postfix[32];
|
15973
|
+
snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
|
15974
|
+
std::string str_postfix(postfix);
|
15975
|
+
|
15976
|
+
// check if dest ends with postfix
|
15977
|
+
int size_prefix = str_split_path.size() - str_postfix.size();
|
15978
|
+
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
|
15979
|
+
snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
|
15980
|
+
return size_prefix;
|
15981
|
+
}
|
15982
|
+
|
15983
|
+
return 0;
|
15984
|
+
}
|
15985
|
+
|
14314
15986
|
struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
14315
15987
|
struct llama_timings result = {
|
14316
15988
|
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
|