llama_cpp 0.14.3 → 0.14.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +71 -18
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +300 -9333
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +638 -43
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +106 -393
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +133 -93
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +1763 -431
- data/vendor/tmp/llama.cpp/llama.h +67 -19
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
data/vendor/tmp/llama.cpp/llama.cpp CHANGED
@@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

-#ifdef
+#ifdef GGML_USE_CUDA
 # include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 # include "ggml-opencl.h"
@@ -52,12 +52,16 @@
 #define NOMINMAX
 #endif
 #include <windows.h>
+#ifndef PATH_MAX
+#define PATH_MAX MAX_PATH
+#endif
 #include <io.h>
 #endif

 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cctype>
 #include <cfloat>
 #include <cinttypes>
 #include <climits>
@@ -68,7 +72,6 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
@@ -192,6 +195,7 @@ enum llm_arch {
 LLM_ARCH_LLAMA,
 LLM_ARCH_FALCON,
 LLM_ARCH_BAICHUAN,
+LLM_ARCH_GROK,
 LLM_ARCH_GPT2,
 LLM_ARCH_GPTJ,
 LLM_ARCH_GPTNEOX,
@@ -214,6 +218,7 @@ enum llm_arch {
 LLM_ARCH_GEMMA,
 LLM_ARCH_STARCODER2,
 LLM_ARCH_MAMBA,
+LLM_ARCH_XVERSE,
 LLM_ARCH_COMMAND_R,
 LLM_ARCH_UNKNOWN,
 };
@@ -221,6 +226,7 @@ enum llm_arch {
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_LLAMA, "llama" },
 { LLM_ARCH_FALCON, "falcon" },
+{ LLM_ARCH_GROK, "grok" },
 { LLM_ARCH_GPT2, "gpt2" },
 { LLM_ARCH_GPTJ, "gptj" },
 { LLM_ARCH_GPTNEOX, "gptneox" },
@@ -244,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_GEMMA, "gemma" },
 { LLM_ARCH_STARCODER2, "starcoder2" },
 { LLM_ARCH_MAMBA, "mamba" },
+{ LLM_ARCH_XVERSE, "xverse" },
 { LLM_ARCH_COMMAND_R, "command-r" },
 { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
@@ -290,6 +297,10 @@ enum llm_kv {
 LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
 LLM_KV_ROPE_SCALING_FINETUNED,

+LLM_KV_SPLIT_NO,
+LLM_KV_SPLIT_COUNT,
+LLM_KV_SPLIT_TENSORS_COUNT,
+
 LLM_KV_SSM_INNER_SIZE,
 LLM_KV_SSM_CONV_KERNEL,
 LLM_KV_SSM_STATE_SIZE,
@@ -355,6 +366,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
 { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

+{ LLM_KV_SPLIT_NO, "split.no" },
+{ LLM_KV_SPLIT_COUNT, "split.count" },
+{ LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
+
 { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
 { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
 { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
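The three `split.*` keys above are what allow a model sharded with gguf-split to be loaded from its first file: `split.no` is the zero-based shard index, `split.count` the number of shards, and `split.tensors.count` the total tensor count across all shards. As a rough illustration only (not code from this gem, and assuming the `gguf_*` helpers vendored in ggml.h, plus the assumption that the writer stored these keys as 16-bit values, as gguf-split does), the shard metadata of a file can be inspected like this:

    // sketch: read the split metadata of a GGUF shard (assumes the vendored gguf API in ggml.h)
    #include "ggml.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            return 1;
        }
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
        struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
        if (!ctx) {
            return 1;
        }
        const int kid_no    = gguf_find_key(ctx, "split.no");
        const int kid_count = gguf_find_key(ctx, "split.count");
        if (kid_no >= 0 && kid_count >= 0) {
            printf("shard %u of %u\n",
                   (unsigned) gguf_get_val_u16(ctx, kid_no) + 1,
                   (unsigned) gguf_get_val_u16(ctx, kid_count));
        } else {
            printf("single-file model\n");
        }
        gguf_free(ctx);
        return 0;
    }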
@@ -411,9 +426,12 @@ enum llm_tensor {
 LLM_TENSOR_FFN_DOWN,
 LLM_TENSOR_FFN_UP,
 LLM_TENSOR_FFN_ACT,
-LLM_TENSOR_FFN_DOWN_EXP,
+LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
 LLM_TENSOR_FFN_GATE_EXP,
 LLM_TENSOR_FFN_UP_EXP,
+LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
+LLM_TENSOR_FFN_GATE_EXPS,
+LLM_TENSOR_FFN_UP_EXPS,
 LLM_TENSOR_ATTN_Q_NORM,
 LLM_TENSOR_ATTN_K_NORM,
 LLM_TENSOR_LAYER_OUT_NORM,
@@ -448,6 +466,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
 { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
 { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
 },
 },
 {
@@ -483,6 +504,31 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_GROK,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+},
+},
 {
 LLM_ARCH_GPT2,
 {
@@ -548,6 +594,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
+{ LLM_TENSOR_POS_EMBD, "position_embd" },
+{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
 },
 },
 {
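With the merged layout, all experts of a layer live in one 3-D tensor (for example `blk.0.ffn_up_exps` with shape {n_embd, n_ff, n_expert}), and expert x is the 2-D slice that starts nb[2]*x bytes into it, which is the same offset the loader uses further down when it maps old split-expert files into the merged tensor. A standalone sketch of that layout, using plain ggml calls and made-up sizes:

    // sketch: byte offsets of individual experts inside a merged [n_embd, n_ff, n_expert] tensor
    #include "ggml.h"
    #include <cstdio>

    int main() {
        const int64_t n_embd = 16, n_ff = 64, n_expert = 8; // hypothetical sizes

        struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(params);

        // merged tensor: nb[2] is the byte stride between consecutive experts
        struct ggml_tensor * up_exps = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);

        for (int64_t x = 0; x < n_expert; ++x) {
            // a single expert is an {n_embd, n_ff} view at byte offset nb[2]*x
            struct ggml_tensor * up_x = ggml_view_2d(ctx, up_exps, n_embd, n_ff, up_exps->nb[1], up_exps->nb[2]*x);
            printf("expert %2d: offset %zu, size %zu bytes\n", (int) x, (size_t) (up_exps->nb[2]*x), ggml_nbytes(up_x));
        }

        ggml_free(ctx);
        return 0;
    }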
@@ -843,6 +892,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
 },
 },
+{
+LLM_ARCH_XVERSE,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
 {
 LLM_ARCH_COMMAND_R,
 {
@@ -1030,7 +1098,7 @@ struct llama_file {
 size_t size;

 llama_file(const char * fname, const char * mode) {
-fp =
+fp = ggml_fopen(fname, mode);
 if (fp == NULL) {
 throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
 }
@@ -1099,6 +1167,7 @@ struct llama_file {
 }
 }
 };
+using llama_files = std::vector<std::unique_ptr<llama_file>>;

 struct llama_mmap {
 void * addr;
@@ -1299,6 +1368,7 @@ struct llama_mmap {
 }
 #endif
 };
+using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;

 // Represents some region of memory being locked using mlock or VirtualLock;
 // will automatically unlock on destruction.
@@ -1448,6 +1518,7 @@ struct llama_mlock {
 static void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
+using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

 static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
 std::vector<char> result(8, 0);
@@ -1467,7 +1538,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
 ggml_backend_buffer_type_t buft = nullptr;

-#if defined(
+#if defined(GGML_USE_CUDA)
 // host buffers should only be used when data is expected to be copied to/from the GPU
 if (host_buffer) {
 buft = ggml_backend_cuda_host_buffer_type();
@@ -1497,7 +1568,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {

 #ifdef GGML_USE_METAL
 buft = ggml_backend_metal_buffer_type();
-#elif defined(
+#elif defined(GGML_USE_CUDA)
 buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
 buft = ggml_backend_vk_buffer_type(gpu);
@@ -1523,7 +1594,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
 ggml_backend_buffer_type_t buft = nullptr;

-#ifdef
+#ifdef GGML_USE_CUDA
 if (ggml_backend_cuda_get_device_count() > 1) {
 buft = ggml_backend_cuda_split_buffer_type(tensor_split);
 }
@@ -1544,7 +1615,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 }

 static size_t llama_get_device_count() {
-#if defined(
+#if defined(GGML_USE_CUDA)
 return ggml_backend_cuda_get_device_count();
 #elif defined(GGML_USE_SYCL)
 return ggml_backend_sycl_get_device_count();
@@ -1556,7 +1627,7 @@
 }

 static size_t llama_get_device_memory(int device) {
-#if defined(
+#if defined(GGML_USE_CUDA)
 size_t total;
 size_t free;
 ggml_backend_cuda_get_device_memory(device, &total, &free);
@@ -1621,6 +1692,7 @@ enum e_model {
 MODEL_40B,
 MODEL_65B,
 MODEL_70B,
+MODEL_314B,
 MODEL_SMALL,
 MODEL_MEDIUM,
 MODEL_LARGE,
@@ -1738,6 +1810,7 @@ struct llama_cparams {
 uint32_t n_ctx; // context size used during inference
 uint32_t n_batch;
 uint32_t n_ubatch;
+uint32_t n_seq_max;
 uint32_t n_threads; // number of threads to use for generation
 uint32_t n_threads_batch; // number of threads to use for batch processing

@@ -1803,9 +1876,9 @@ struct llama_layer {

 // ff MoE
 struct ggml_tensor * ffn_gate_inp;
-struct ggml_tensor *
-struct ggml_tensor *
-struct ggml_tensor *
+struct ggml_tensor * ffn_gate_exps;
+struct ggml_tensor * ffn_down_exps;
+struct ggml_tensor * ffn_up_exps ;

 // ff bias
 struct ggml_tensor * ffn_down_b; // b2
@@ -2023,12 +2096,12 @@ struct llama_model {
 // the model memory buffers for the tensor data
 std::vector<ggml_backend_buffer_t> bufs;

-// model memory mapped
-
+// model memory mapped files
+llama_mmaps mappings;

 // objects representing data potentially being locked in memory
-
-
+llama_mlocks mlock_bufs;
+llama_mlocks mlock_mmaps;

 // for quantize-stats only
 std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -2041,7 +2114,7 @@ struct llama_model {
 ggml_free(ctx);
 }
 for (ggml_backend_buffer_t buf : bufs) {
-#ifdef
+#ifdef GGML_USE_CUDA
 if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
 ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
 }
@@ -2060,10 +2133,6 @@ struct llama_context {
 ggml_backend_free(backend);
 }

-#ifdef GGML_USE_VULKAN
-ggml_vk_free_cpu_assist();
-#endif
-
 ggml_backend_buffer_free(buf_output);
 }

@@ -2100,20 +2169,20 @@ struct llama_context {
 // host buffer for the model output (logits and embeddings)
 ggml_backend_buffer_t buf_output = nullptr;

-// decode output (2-dimensional array: [
-size_t
-float * logits
+// decode output (2-dimensional array: [n_outputs][n_vocab])
+size_t logits_size = 0; // capacity (of floats) for logits
+float * logits = nullptr;
+
+std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
+size_t output_size = 0; // capacity (of tokens positions) for the output buffers
+int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch

-#ifndef NDEBUG
-// guard against access to unset logits
-std::vector<bool> logits_valid;
-#endif
 bool logits_all = false;

-// embeddings output (2-dimensional array: [
+// embeddings output (2-dimensional array: [n_outputs][n_embd])
 // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
-size_t
-float * embd
+size_t embd_size = 0; // capacity (of floats) for embeddings
+float * embd = nullptr;

 // sequence embeddings output (map of [n_embd] vectors)
 // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
@@ -2130,14 +2199,15 @@ struct llama_context {
 struct ggml_tensor * inp_tokens; // I32 [n_batch]
 struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
 struct ggml_tensor * inp_pos; // I32 [n_batch]
+struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
 struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
-struct ggml_tensor * inp_KQ_pos; // F32 [
+struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
 struct ggml_tensor * inp_K_shift; // I32 [kv_size]
 struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
 struct ggml_tensor * inp_cls; // I32 [n_batch]
 struct ggml_tensor * inp_s_copy; // I32 [kv_size]
-struct ggml_tensor * inp_s_mask; // F32 [1,
-struct ggml_tensor * inp_s_seq; // I32 [
+struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
+struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]

 // control vectors
 struct llama_control_vector cvec;
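The new `output_ids`/`n_outputs` bookkeeping lets the context allocate logits and embedding rows only for the batch positions that actually requested output, rather than one row per token. From the caller's side that corresponds to the usual pattern below (a hedged usage sketch against the public C API in llama.h, not code taken from this gem): mark the wanted positions in `batch.logits`, decode, then fetch rows by batch position and let the context translate the position to its compacted slot.

    // sketch: request logits for the last token only, then read that row back (assumes llama.h)
    #include "llama.h"

    static float * logits_for_last_token(llama_context * ctx, llama_batch & batch) {
        for (int32_t i = 0; i < batch.n_tokens; ++i) {
            batch.logits[i] = (i == batch.n_tokens - 1); // output only for the final position
        }
        if (llama_decode(ctx, batch) != 0) {
            return nullptr; // decode failed or was aborted
        }
        // the context maps the batch position to its output row internally (output_ids)
        return llama_get_logits_ith(ctx, batch.n_tokens - 1);
    }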
@@ -2792,6 +2862,8 @@ namespace GGUFMeta {
 };
 }

+using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+
 struct llama_model_loader {
 int n_kv = 0;
 int n_tensors = 0;
@@ -2802,54 +2874,133 @@ struct llama_model_loader {

 bool use_mmap = false;

-
+llama_files files;
 llama_ftype ftype;
 llama_fver fver;

-
+llama_mmaps mappings;
+
+// Holds information on a model weight
+struct llama_tensor_weight {
+uint16_t idx; // source file index
+size_t offs; // tensor data offset in the original file
+
+ggml_tensor * tensor;
+
+llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+}
+};
+std::vector<llama_tensor_weight> weights;
+
 std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

-struct gguf_context *
-
+struct gguf_context * meta = NULL;
+std::vector<ggml_context *> contexts;

 std::string arch_name;
 LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

-llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p)
+llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
 int trace = 0;
 if (getenv("LLAMA_TRACE")) {
 trace = atoi(getenv("LLAMA_TRACE"));
 }

-struct gguf_init_params params = {
-/*.no_alloc = */ true,
-/*.ctx = */ &ctx_meta,
-};
-
 if (param_overrides_p != nullptr) {
 for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
 kv_overrides.insert({std::string(p->key), *p});
 }
 }

-
-
+struct ggml_context * ctx = NULL;
+struct gguf_init_params params = {
+/*.no_alloc = */ true,
+/*.ctx = */ &ctx,
+};
+
+meta = gguf_init_from_file(fname.c_str(), params);
+if (!meta) {
 throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
 }

 get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
 llm_kv = LLM_KV(llm_arch_from_string(arch_name));

-
-
+// Save tensors data offset of the main file.
+// For subsidiary files, `meta` tensor data offset must not be used,
+// so we build a unified tensors index for weights.
+for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+weights.emplace_back(0, cur->name, meta, cur);
+}
+files.emplace_back(new llama_file(fname.c_str(), "rb"));
+contexts.emplace_back(ctx);
+
+uint16_t n_split = 0;
+get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+
+// Load additional GGML contexts
+if (n_split > 1) {
+uint16_t idx = 0;
+get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
+if (idx != 0) {
+throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
+}
+
+char split_prefix[PATH_MAX] = {0};
+if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
+throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
+}
+
+if (trace > 0) {
+LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
+}
+
+char split_path[PATH_MAX] = {0};
+for (idx = 1; idx < n_split; idx++) {
+llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+struct gguf_init_params split_params = {
+/*.no_alloc = */ true,
+/*.ctx = */ &ctx,
+};
+struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
+if (!ctx_gguf) {
+throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
+}
+
+// Save tensors data offset info of the shard.
+for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+}
+files.emplace_back(new llama_file(split_path, "rb"));
+contexts.emplace_back(ctx);
+
+gguf_free(ctx_gguf);
+}
+
+get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+
+// sanity check
+{
+const int n_tensors_loaded = (int) weights.size();
+if (n_tensors != n_tensors_loaded) {
+throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+}
+}
+
+LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+}
+
+n_kv = gguf_get_n_kv(meta);
+n_tensors = weights.size();

-fver = (enum llama_fver
+fver = (enum llama_fver) gguf_get_version(meta);

-for (
-
-
-n_elements += ggml_nelements(t);
-n_bytes += ggml_nbytes(t);
+for (auto & w : weights) {
+n_elements += ggml_nelements(w.tensor);
+n_bytes += ggml_nbytes(w.tensor);
 }

 LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
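The constructor above only ever receives the path of the first shard; the remaining shard paths are derived from it with the `llama_split_prefix`/`llama_split_path` helpers added to the public header in this release (part of the llama.h change listed at the top). A hedged sketch of how a caller could enumerate the shard paths the loader will look for:

    // sketch: list the shard file names a split model is expected to use (assumes the llama.h helpers)
    #include "llama.h"
    #include <cstdio>

    static void print_split_paths(const char * first_split, int n_split) {
        char prefix[512] = {0};
        // recover e.g. "model" from "model-00001-of-00003.gguf"; returns 0 if the name does not match
        if (!llama_split_prefix(prefix, sizeof(prefix), first_split, /*split_no =*/ 0, n_split)) {
            fprintf(stderr, "not a recognized split file name: %s\n", first_split);
            return;
        }
        char path[512] = {0};
        for (int idx = 0; idx < n_split; ++idx) {
            llama_split_path(path, sizeof(path), prefix, idx, n_split);
            printf("%s\n", path); // e.g. "model-00002-of-00003.gguf" for idx == 1
        }
    }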
@@ -2864,7 +3015,8 @@ struct llama_model_loader {
 enum ggml_type type_max = GGML_TYPE_F32;

 for (int i = 0; i < n_tensors; i++) {
-
+const ggml_tensor * tensor = weights.at(i).tensor;
+enum ggml_type type = tensor->type;

 n_type[type]++;

@@ -2874,8 +3026,8 @@
 }

 if (trace > 0) {
-
-LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(
+const uint16_t sid = weights.at(i).idx;
+LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
 }
 }

@@ -2897,6 +3049,7 @@ struct llama_model_loader {
 case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
 case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
 case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
 case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
 case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
 case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
@@ -2911,22 +3064,23 @@ struct llama_model_loader {
 ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

 {
-const int kid = gguf_find_key(
+const int kid = gguf_find_key(meta, "general.file_type");
 if (kid >= 0) {
-ftype = (llama_ftype) gguf_get_val_u32(
+ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
 }
 }

 LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+
 for (int i = 0; i < n_kv; i++) {
-const char * name = gguf_get_key(
-const enum gguf_type type = gguf_get_kv_type(
+const char * name = gguf_get_key(meta, i);
+const enum gguf_type type = gguf_get_kv_type(meta, i);
 const std::string type_name =
 type == GGUF_TYPE_ARRAY
-? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(
+? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
 : gguf_type_name(type);

-std::string value = gguf_kv_to_str(
+std::string value = gguf_kv_to_str(meta, i);
 const size_t MAX_VALUE_LEN = 40;
 if (value.size() > MAX_VALUE_LEN) {
 value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -2955,18 +3109,18 @@ struct llama_model_loader {
 }

 ~llama_model_loader() {
-if (
-gguf_free(
+if (meta) {
+gguf_free(meta);
 }
-
-ggml_free(
+for (auto * ctx : contexts) {
+ggml_free(ctx);
 }
 }

 template<typename T>
 typename std::enable_if<std::is_integral<T>::value, bool>::type
 get_arr_n(const std::string & key, T & result, const bool required = true) {
-const int kid = gguf_find_key(
+const int kid = gguf_find_key(meta, key.c_str());

 if (kid < 0) {
 if (required) {
@@ -2976,7 +3130,7 @@ struct llama_model_loader {
 }

 struct GGUFMeta::ArrayInfo arr_info =
-GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(
+GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);


 result = arr_info.length;
@@ -2996,7 +3150,7 @@ struct llama_model_loader {
 const struct llama_model_kv_override * override =
 it != kv_overrides.end() ? &it->second : nullptr;

-const bool found = GGUFMeta::GKV<T>::set(
+const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);

 if (required && !found) {
 throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -3019,28 +3173,57 @@ struct llama_model_loader {
 }

 const char * get_tensor_name(int i) const {
-return
+return weights.at(i).tensor->name;
+}
+
+const llama_tensor_weight * get_weight(const char * name) const {
+for (const auto & weight : weights) {
+if (strcmp(name, weight.tensor->name) == 0) {
+return &weight;
+}
+}
+return nullptr;
+}
+
+const llama_tensor_weight & require_weight(const char * name) const {
+const llama_tensor_weight * weight = get_weight(name);
+if (!weight) {
+throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+}
+return *weight;
 }

 struct ggml_tensor * get_tensor_meta(const char * name) const {
-
+const auto * weight = get_weight(name);
+if (!weight) {
+return nullptr;
+}
+return weight->tensor;
+}
+
+struct ggml_tensor * require_tensor_meta(const char * name) const {
+struct ggml_tensor * tensor = get_tensor_meta(name);
+if (!tensor) {
+throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+}
+return tensor;
 }

 struct ggml_tensor * get_tensor_meta(int i) const {
 return get_tensor_meta(get_tensor_name(i));
 }

-struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor *
-struct ggml_tensor * tensor = ggml_dup_tensor(ctx,
-ggml_set_name(tensor, ggml_get_name(
+struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
+ggml_set_name(tensor, ggml_get_name(cur));

 n_created++;

 return tensor;
 }

-struct ggml_tensor *
-struct ggml_tensor * cur =
+const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
+const struct ggml_tensor * cur = get_tensor_meta(name.c_str());

 if (cur == NULL) {
 if (!required) {
@@ -3051,8 +3234,8 @@ struct llama_model_loader {

 {
 bool is_ok = true;
-for (size_t i = 0; i <
-if (ne[i] != cur->ne[i]) {
+for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
 is_ok = false;
 break;
 }
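The loop above is the relaxed shape check that the refactored loader relies on (see `check_tensor_dims` in the next hunk): the caller may pass fewer dimensions than GGML_MAX_DIMS, and every trailing dimension of the tensor on disk must then be 1. Restated on its own, as a sketch only:

    // sketch: requested dimensions must match, unspecified trailing dimensions must be 1
    #include <cstdint>
    #include <vector>

    constexpr size_t kMaxDims = 4; // mirrors GGML_MAX_DIMS

    static bool dims_match(const int64_t (&ne_tensor)[kMaxDims], const std::vector<int64_t> & ne_requested) {
        for (size_t i = 0; i < kMaxDims; ++i) {
            if (i < ne_requested.size()) {
                if (ne_requested[i] != ne_tensor[i]) {
                    return false; // explicitly requested dimension disagrees
                }
            } else if (ne_tensor[i] != 1) {
                return false; // trailing dimensions not mentioned by the caller must be singleton
            }
        }
        return true;
    }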
@@ -3066,127 +3249,196 @@ struct llama_model_loader {
 }
 }

-return
+return cur;
 }

-
-
-
+struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
+const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+
+if (cur == NULL) {
+return NULL;
 }
+
+return create_tensor_for(ctx, cur);
 }

-
-const
+struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
+const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);

-if (
-
+if (cur == NULL) {
+return NULL;
 }

-
-
+if (cur->type != base->type) {
+throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
+}

-
-
-
-mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+std::array<int64_t, GGML_MAX_DIMS> dims;
+for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+dims[i] = i < ne.size() ? ne[i] : 1;
 }

-
-
-
-
+struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
+dims[0], dims[1], dims[2], dims[3],
+cur->nb[1], cur->nb[2], cur->nb[3],
+offset);
+
+ggml_set_name(tensor, name.c_str());
+
+n_created++;
+
+return tensor;
+}
+
+void done_getting_tensors() const {
+if (n_created != n_tensors) {
+throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
 }
+}

-
-
-
+void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
+if (use_mmap) {
+mappings.reserve(files.size());
+mmaps_used.reserve(files.size());
+for (const auto & file : files) {
+std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+mmaps_used.emplace_back(mapping->size, 0);
+if (mlock_mmaps) {
+std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
+mlock_mmap->init(mapping->addr);
+mlock_mmaps->emplace_back(std::move(mlock_mmap));
+}
+mappings.emplace_back(std::move(mapping));
 }
-
+}
+
+// compute the total size of all tensors for progress reporting
+for (auto & w : weights) {
+size_data += ggml_nbytes(w.tensor);
 }
 }

-void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
-GGML_ASSERT(
+void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+GGML_ASSERT(!mappings.empty());
+const auto & mapping = mappings.at(idx);

 *first = mapping->size;
 *last = 0;
+*addr = mapping->addr;
 for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-
-
-
+try {
+const auto * weight = get_weight(ggml_get_name(tensor));
+if (!weight) {
+continue;
+}
+if (weight->idx != idx) {
+continue;
+}
+*first = std::min(*first, weight->offs);
+*last = std::max(*last, weight->offs + ggml_nbytes(tensor));
+} catch(...) {
+// the tensor is not in the model
+}
 }
 }

 // for backwards compatibility, does not support ggml-backend
 void load_data_for(struct ggml_tensor * cur) const {
-const
+const auto & w = require_weight(ggml_get_name(cur));

-if (use_mmap
+if (use_mmap) {
+const auto & mapping = mappings.at(w.idx);
 if (cur->data == nullptr) {
-cur->data = (uint8_t *)mapping->addr + offs;
+cur->data = (uint8_t *)mapping->addr + w.offs;
 } else {
-memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
+memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
 }
 } else {
 GGML_ASSERT(cur->data != nullptr);
-
-file.
+GGML_ASSERT(w.idx < files.size());
+const auto & file = files.at(w.idx);
+file->seek(w.offs, SEEK_SET);
+file->read_raw(cur->data, ggml_nbytes(cur));
 }
 }

 size_t size_done = 0;
 size_t size_data = 0;
-size_t
-size_t mmap_used_last = 0;
+std::vector<std::pair<size_t, size_t>> mmaps_used;

 // Returns false if cancelled by progress_callback
-bool load_all_data(
-
+bool load_all_data(
+struct ggml_context * ctx,
+llama_buf_map & bufs_mmap,
+llama_mlocks * lmlocks,
+llama_progress_callback progress_callback,
+void * progress_callback_user_data) {
+GGML_ASSERT(size_data != 0 && "call init_mappings() first");

 std::vector<no_init<uint8_t>> read_buf;
-
 for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+const auto * weight = get_weight(ggml_get_name(cur));
+if (weight == nullptr) {
+// this can happen with split experts models
+continue;
+}
+
 if (progress_callback) {
 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
 return false;
 }
 }

-
+size_t n_size = ggml_nbytes(cur);

-if (use_mmap
+if (use_mmap) {
+const auto & mapping = mappings.at(weight->idx);
+ggml_backend_buffer_t buf_mmap = nullptr;
+if (bufs_mmap.count(weight->idx)) {
+buf_mmap = bufs_mmap.at(weight->idx);
+}
+GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
 if (buf_mmap && cur->data == nullptr) {
-ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
-if (
-lmlock
+ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
+if (lmlocks) {
+const auto & lmlock = lmlocks->at(weight->idx);
+lmlock->grow_to(weight->offs + ggml_nbytes(cur));
 }
-
-
+
+auto & mmap_used = mmaps_used[weight->idx];
+mmap_used.first = std::min(mmap_used.first, weight->offs);
+mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
 } else {
-ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0,
+ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
 }
 } else {
+GGML_ASSERT(weight->idx < files.size());
+const auto & file = files.at(weight->idx);
 if (ggml_backend_buffer_is_host(cur->buffer)) {
-file
-file
+file->seek(weight->offs, SEEK_SET);
+file->read_raw(cur->data, ggml_nbytes(cur));
 } else {
 read_buf.resize(ggml_nbytes(cur));
-file
-file
-ggml_backend_tensor_set(cur, read_buf.data(), 0,
+file->seek(weight->offs, SEEK_SET);
+file->read_raw(read_buf.data(), ggml_nbytes(cur));
+ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
 }
 }

-size_done +=
+size_done += n_size;
 }

 // check if this is the last call and do final cleanup
 if (size_done >= size_data) {
 // unmap offloaded tensors and metadata
-if (use_mmap
-
-
-mapping
+if (use_mmap) {
+for (uint32_t idx = 0; idx < mappings.size(); idx++) {
+const auto & mmap_used = mmaps_used.at(idx);
+auto & mapping = mappings.at(idx);
+mapping->unmap_fragment(0, mmap_used.first);
+if (mmap_used.second != 0) {
+mapping->unmap_fragment(mmap_used.second, mapping->size);
+}
 }
 }
 if (progress_callback) {
@@ -3259,6 +3511,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
+case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
@@ -3290,6 +3543,7 @@ static const char * llama_model_type_name(e_model type) {
 case MODEL_40B: return "40B";
 case MODEL_65B: return "65B";
 case MODEL_70B: return "70B";
+case MODEL_314B: return "314B";
 case MODEL_SMALL: return "0.1B";
 case MODEL_MEDIUM: return "0.4B";
 case MODEL_LARGE: return "0.8B";
@@ -3319,7 +3573,7 @@ static void llm_load_hparams(
 llama_model_loader & ml,
 llama_model & model) {
 auto & hparams = model.hparams;
-const gguf_context * ctx = ml.
+const gguf_context * ctx = ml.meta;

 // get metadata as string
 for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3428,6 +3682,15 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_GROK:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+switch (hparams.n_layer) {
+case 64: model.type = e_model::MODEL_314B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 case LLM_ARCH_FALCON:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3679,6 +3942,16 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_XVERSE:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+switch (hparams.n_layer) {
+case 32: model.type = e_model::MODEL_7B; break;
+case 40: model.type = e_model::MODEL_13B; break;
+case 80: model.type = e_model::MODEL_65B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 case LLM_ARCH_COMMAND_R:
 {
 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -3709,7 +3982,7 @@ static void llm_load_vocab(
 llama_model & model) {
 auto & vocab = model.vocab;

-struct gguf_context * ctx = ml.
+struct gguf_context * ctx = ml.meta;

 const auto kv = LLM_KV(model.arch);

@@ -3842,7 +4115,7 @@ static void llm_load_vocab(
 } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
 vocab.linefeed_id = vocab.special_pad_id;
 } else {
-const std::vector<int> ids = llama_tokenize_internal(vocab, "\
+const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
 GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
 vocab.linefeed_id = ids[0];
 }
@@ -4075,6 +4348,7 @@ static bool llm_load_tensors(

 const int64_t n_layer = hparams.n_layer;
 const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+bool use_mmap_buffer = true;

 // there is very little benefit to offloading the input layer, so always keep it on the CPU
 model.buft_input = llama_default_buffer_type_cpu(true);
@@ -4163,6 +4437,10 @@ static bool llm_load_tensors(

 // create one context per buffer type
 size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
+
+// for moe merged tensors
+ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
+
 std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
 for (auto & it : buft_layer_count) {
 struct ggml_init_params params = {
@@ -4189,6 +4467,11 @@ static bool llm_load_tensors(
 const int64_t n_vocab = hparams.n_vocab;
 const int64_t n_vocab_type = hparams.n_vocab_type;
 const int64_t n_ff = hparams.n_ff;
+const int64_t n_expert = hparams.n_expert;
+
+if (n_expert > 0 && hparams.n_expert_used == 0) {
+throw std::runtime_error("model has expert layers but no expert layers are used");
+}

 GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

@@ -4243,26 +4526,113 @@ static bool llm_load_tensors(

 layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

-
-
-if (layer.ffn_gate_inp == nullptr) {
-GGML_ASSERT(hparams.n_expert == 0);
-GGML_ASSERT(hparams.n_expert_used == 0);
-
+if (n_expert == 0) {
 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
 } else {
-
-
-
-
-
-layer.
-
-
+layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+if (layer.ffn_gate_exps) {
+layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+} else {
+// merge split expert into a single tensor for compatibility with older models
+// requires disabling mmap
+use_mmap_buffer = false;
+
+ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
+ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
+ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
+
+layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
+layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
+layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
+
+ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
+ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
+ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
+
+for (uint32_t x = 0; x < n_expert; ++x) {
+// the individual experts are loaded into a view of the merged tensor
+ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
+ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
+ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
+}
+}
+}
+}
+} break;
+case LLM_ARCH_GROK:
+{
+if (n_expert == 0) {
+throw std::runtime_error("Grok model cannot have zero experts");
+}
+
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+// if output is NULL, init from the input tok embed
+if (model.output == NULL) {
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ml.n_created--; // artificial tensor
+ml.size_data += ggml_nbytes(model.output);
+}
+}
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+if (layer.ffn_gate_exps) {
+layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+} else {
+// merge split expert into a single tensor for compatibility with older models
+// requires disabling mmap
+use_mmap_buffer = false;
+
+ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
+ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
+ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
+
+layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
+layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
+layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
+
+ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
+ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
+ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
+
+for (uint32_t x = 0; x < n_expert; ++x) {
+// the individual experts are loaded into a view of the merged tensor
+ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
+ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
+ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
 }
 }
+
+layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
 }
 } break;
 case LLM_ARCH_BAICHUAN:
@@ -4319,10 +4689,8 @@ static bool llm_load_tensors(
 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-
-
-layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
-}
+layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
+layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);

 layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -4502,6 +4870,7 @@ static bool llm_load_tensors(
 case LLM_ARCH_MPT:
 {
 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);

 // output
 {
@@ -4540,6 +4909,12 @@ static bool llm_load_tensors(
 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
 layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);

+layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
+layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+
+layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
+layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+
 // AWQ ScaleActivation layer
 layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
 }
@@ -4986,6 +5361,28 @@ static bool llm_load_tensors(
 layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
 }
 } break;
+case LLM_ARCH_XVERSE:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+}
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+auto & layer = model.layers[i];
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+}
+} break;
 case LLM_ARCH_COMMAND_R:
 {
 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5024,56 +5421,97 @@ static bool llm_load_tensors(
|
|
5024
5421
|
|
5025
5422
|
ml.done_getting_tensors();
|
5026
5423
|
|
5027
|
-
ml.
|
5424
|
+
ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
|
5425
|
+
model.mappings.reserve(ml.mappings.size());
|
5028
5426
|
|
5029
5427
|
// create the backend buffers
|
5030
|
-
std::vector<std::pair<ggml_context *,
|
5428
|
+
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
|
5429
|
+
ctx_bufs.reserve(ctx_map.size());
|
5430
|
+
|
5431
|
+
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
|
5432
|
+
size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
|
5433
|
+
model.bufs.reserve(n_max_backend_buffer);
|
5031
5434
|
|
5032
5435
|
for (auto & it : ctx_map) {
|
5033
5436
|
ggml_backend_buffer_type_t buft = it.first;
|
5034
|
-
ggml_context * ctx
|
5035
|
-
|
5437
|
+
ggml_context * ctx = it.second;
|
5438
|
+
|
5439
|
+
llama_buf_map bufs;
|
5440
|
+
bufs.reserve(n_max_backend_buffer);
|
5036
5441
|
|
5037
5442
|
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
5038
5443
|
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
5039
5444
|
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
5040
|
-
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
|
5041
|
-
|
5042
|
-
|
5043
|
-
|
5044
|
-
|
5045
|
-
|
5046
|
-
|
5445
|
+
if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
|
5446
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5447
|
+
void * addr = nullptr;
|
5448
|
+
size_t first, last;
|
5449
|
+
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
5450
|
+
if (first >= last) {
|
5451
|
+
continue;
|
5452
|
+
}
|
5453
|
+
ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
|
5454
|
+
if (buf == nullptr) {
|
5455
|
+
throw std::runtime_error("unable to allocate backend CPU buffer");
|
5456
|
+
}
|
5457
|
+
model.bufs.push_back(buf);
|
5458
|
+
bufs.emplace(idx, buf);
|
5459
|
+
#ifdef GGML_USE_CUDA
|
5460
|
+
if (n_layer >= n_gpu_layers) {
|
5461
|
+
ggml_backend_cuda_register_host_buffer(
|
5047
5462
|
ggml_backend_buffer_get_base(buf),
|
5048
5463
|
ggml_backend_buffer_get_size(buf));
|
5049
|
-
|
5464
|
+
}
|
5050
5465
|
#endif
|
5466
|
+
}
|
5051
5467
|
}
|
5052
5468
|
#ifdef GGML_USE_METAL
|
5053
|
-
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
|
5054
|
-
|
5055
|
-
|
5056
|
-
|
5057
|
-
|
5469
|
+
else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
|
5470
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5471
|
+
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
5472
|
+
void * addr = nullptr;
|
5473
|
+
size_t first, last;
|
5474
|
+
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
5475
|
+
if (first >= last) {
|
5476
|
+
continue;
|
5477
|
+
}
|
5478
|
+
ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
|
5479
|
+
if (buf == nullptr) {
|
5480
|
+
throw std::runtime_error("unable to allocate backend metal buffer");
|
5481
|
+
}
|
5482
|
+
model.bufs.push_back(buf);
|
5483
|
+
bufs.emplace(idx, buf);
|
5484
|
+
}
|
5058
5485
|
}
|
5059
5486
|
#endif
|
5060
5487
|
else {
|
5061
|
-
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
5062
|
-
if (buf
|
5488
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
5489
|
+
if (buf == nullptr) {
|
5490
|
+
throw std::runtime_error("unable to allocate backend buffer");
|
5491
|
+
}
|
5492
|
+
model.bufs.push_back(buf);
|
5493
|
+
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
|
5063
5494
|
model.mlock_bufs.emplace_back(new llama_mlock);
|
5064
5495
|
auto & mlock_buf = model.mlock_bufs.back();
|
5065
5496
|
mlock_buf->init (ggml_backend_buffer_get_base(buf));
|
5066
5497
|
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
|
5067
5498
|
}
|
5499
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5500
|
+
bufs.emplace(idx, buf);
|
5501
|
+
}
|
5068
5502
|
}
|
5069
|
-
|
5503
|
+
|
5504
|
+
if (bufs.empty()) {
|
5070
5505
|
throw std::runtime_error("failed to allocate buffer");
|
5071
5506
|
}
|
5072
|
-
|
5073
|
-
|
5074
|
-
|
5075
|
-
|
5076
|
-
|
5507
|
+
|
5508
|
+
for (auto & buf : bufs) {
|
5509
|
+
// indicate that this buffer contains weights
|
5510
|
+
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
|
5511
|
+
ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
5512
|
+
}
|
5513
|
+
|
5514
|
+
ctx_bufs.emplace_back(ctx, bufs);
|
5077
5515
|
}
|
5078
5516
|
|
5079
5517
|
if (llama_supports_gpu_offload()) {
|
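Annotation: the buffer-creation hunk above now walks every mapped model file and, for each context, wraps only the byte range [first, last) that the context's tensors occupy in that file as a backend buffer (CPU or Metal), skipping files that contribute nothing; the resulting per-file buffers are collected in a llama_buf_map. A rough sketch of the range-per-file idea, with hypothetical types and not the gem's actual loader:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct MappedFile { std::uint8_t * base; std::size_t size; };
    struct BufferRef  { void * addr; std::size_t size; };

    // For each mapped file, wrap only the sub-range used by this context's tensors,
    // mirroring ml.get_mapping_range(&first, &last, &addr, idx, ctx) in the diff.
    std::vector<BufferRef> make_buffers(const std::vector<MappedFile> & files,
                                        void (*get_range)(std::uint32_t, std::size_t *, std::size_t *)) {
        std::vector<BufferRef> bufs;
        for (std::uint32_t idx = 0; idx < files.size(); ++idx) {
            std::size_t first = 0, last = 0;
            get_range(idx, &first, &last);
            if (first >= last) {
                continue; // this file holds no tensors for the context
            }
            bufs.push_back({ files[idx].base + first, last - first });
        }
        return bufs;
    }

Keeping one buffer per file is what makes multi-file mmap'd models work while still allowing partial offloading, as the Metal comment retained in the diff explains.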
@@ -5105,13 +5543,17 @@ static bool llm_load_tensors(
|
|
5105
5543
|
// load tensor data
|
5106
5544
|
for (auto & it : ctx_bufs) {
|
5107
5545
|
ggml_context * ctx = it.first;
|
5108
|
-
|
5109
|
-
if (!ml.load_all_data(ctx,
|
5546
|
+
auto & bufs = it.second;
|
5547
|
+
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
5110
5548
|
return false;
|
5111
5549
|
}
|
5112
5550
|
}
|
5113
5551
|
|
5114
|
-
|
5552
|
+
if (use_mmap_buffer) {
|
5553
|
+
for (auto & mapping : ml.mappings) {
|
5554
|
+
model.mappings.emplace_back(std::move(mapping));
|
5555
|
+
}
|
5556
|
+
}
|
5115
5557
|
|
5116
5558
|
// loading time will be recalculate after the first eval, so
|
5117
5559
|
// we take page faults deferred by mmap() into consideration
|
@@ -5266,8 +5708,8 @@ static void llm_build_kv_store(
|
|
5266
5708
|
GGML_ASSERT(kv.size == n_ctx);
|
5267
5709
|
|
5268
5710
|
// compute the transposed [n_tokens, n_embd] V matrix
|
5269
|
-
|
5270
|
-
|
5711
|
+
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
|
5712
|
+
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
|
5271
5713
|
cb(v_cur_t, "v_cur_t", il);
|
5272
5714
|
|
5273
5715
|
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
|
@@ -5451,6 +5893,20 @@ static struct ggml_tensor * llm_build_kqv(
|
|
5451
5893
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
5452
5894
|
}
|
5453
5895
|
|
5896
|
+
if (model.arch == LLM_ARCH_GROK) {
|
5897
|
+
// need to do the following:
|
5898
|
+
// multiply by attn_output_multiplier of 0.08838834764831845
|
5899
|
+
// and then :
|
5900
|
+
// kq = 30 * tanh(kq / 30)
|
5901
|
+
// before the softmax below
|
5902
|
+
|
5903
|
+
//try from phi2
|
5904
|
+
//ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
5905
|
+
|
5906
|
+
kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
|
5907
|
+
kq = ggml_scale(ctx, kq, 30);
|
5908
|
+
}
|
5909
|
+
|
5454
5910
|
#if defined(GGML_USE_KOMPUTE)
|
5455
5911
|
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
|
5456
5912
|
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
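Annotation: the Grok-specific block above scales the raw attention scores and soft-caps them before the softmax, i.e. kq = 30 * tanh((0.08838834764831845 / 30) * kq): the scores are multiplied by the attention-output multiplier (approximately 1/sqrt(128)) and then squashed into the range (-30, 30). A scalar sketch of the same arithmetic:

    #include <cmath>

    // Per-element equivalent of:
    //   kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
    //   kq = ggml_scale(ctx, kq, 30);
    static float grok_soft_cap(float kq) {
        const float attn_mult = 0.08838834764831845f; // ~= 1/sqrt(128)
        const float cap       = 30.0f;
        return cap * std::tanh(kq * (attn_mult / cap)); // bounded to (-30, 30)
    }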
@@ -5577,7 +6033,8 @@ struct llm_build_context {
|
|
5577
6033
|
const float norm_rms_eps;
|
5578
6034
|
|
5579
6035
|
const int32_t n_tokens;
|
5580
|
-
const int32_t n_kv; // size of KV cache to consider (n_kv <=
|
6036
|
+
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
|
6037
|
+
const int32_t n_outputs;
|
5581
6038
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
5582
6039
|
const int32_t n_orig_ctx;
|
5583
6040
|
|
@@ -5624,6 +6081,7 @@ struct llm_build_context {
|
|
5624
6081
|
norm_rms_eps (hparams.f_norm_rms_eps),
|
5625
6082
|
n_tokens (batch.n_tokens),
|
5626
6083
|
n_kv (worst_case ? kv_self.size : kv_self.n),
|
6084
|
+
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
|
5627
6085
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
5628
6086
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
5629
6087
|
pooling_type (cparams.pooling_type),
|
@@ -5645,6 +6103,7 @@ struct llm_build_context {
|
|
5645
6103
|
lctx.inp_tokens = nullptr;
|
5646
6104
|
lctx.inp_embd = nullptr;
|
5647
6105
|
lctx.inp_pos = nullptr;
|
6106
|
+
lctx.inp_out_ids = nullptr;
|
5648
6107
|
lctx.inp_KQ_mask = nullptr;
|
5649
6108
|
lctx.inp_KQ_pos = nullptr;
|
5650
6109
|
lctx.inp_K_shift = nullptr;
|
@@ -5768,6 +6227,13 @@ struct llm_build_context {
|
|
5768
6227
|
return lctx.inp_pos;
|
5769
6228
|
}
|
5770
6229
|
|
6230
|
+
struct ggml_tensor * build_inp_out_ids() {
|
6231
|
+
lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
|
6232
|
+
cb(lctx.inp_out_ids, "inp_out_ids", -1);
|
6233
|
+
ggml_set_input(lctx.inp_out_ids);
|
6234
|
+
return lctx.inp_out_ids;
|
6235
|
+
}
|
6236
|
+
|
5771
6237
|
struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
|
5772
6238
|
if (causal) {
|
5773
6239
|
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
|
@@ -5824,6 +6290,9 @@ struct llm_build_context {
|
|
5824
6290
|
struct ggml_cgraph * build_llama() {
|
5825
6291
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5826
6292
|
|
6293
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
6294
|
+
int32_t n_tokens = this->n_tokens;
|
6295
|
+
|
5827
6296
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5828
6297
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
5829
6298
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
@@ -5891,6 +6360,14 @@ struct llm_build_context {
|
|
5891
6360
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5892
6361
|
}
|
5893
6362
|
|
6363
|
+
if (il == n_layer - 1) {
|
6364
|
+
// skip computing output for unused tokens
|
6365
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6366
|
+
n_tokens = n_outputs;
|
6367
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6368
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6369
|
+
}
|
6370
|
+
|
5894
6371
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
5895
6372
|
cb(ffn_inp, "ffn_inp", il);
|
5896
6373
|
|
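Annotation: the "il == n_layer - 1" blocks added across the graph builders implement the new output-skipping scheme. Right after the attention of the last layer, build_inp_out_ids() supplies the indices of the tokens whose logits or embeddings were actually requested, and ggml_get_rows shrinks both cur and the residual to just those rows, so the final FFN and lm_head run for n_outputs tokens instead of n_tokens. A plain-C++ sketch of the row selection itself (not the ggml call):

    #include <cstdint>
    #include <vector>

    // rows: n_tokens rows of n_embd floats, flattened row-major
    // ids:  indices of the tokens that need an output (the inp_out_ids contents)
    // returns n_outputs rows, in the order given by ids
    std::vector<float> get_rows(const std::vector<float> & rows, std::int64_t n_embd,
                                const std::vector<std::int32_t> & ids) {
        std::vector<float> out;
        out.reserve(ids.size() * n_embd);
        for (std::int32_t id : ids) {
            const float * src = rows.data() + (std::int64_t) id * n_embd;
            out.insert(out.end(), src, src + n_embd);
        }
        return out;
    }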
@@ -5943,19 +6420,19 @@ struct llm_build_context {
|
|
5943
6420
|
for (int i = 0; i < n_expert_used; ++i) {
|
5944
6421
|
ggml_tensor * cur_expert;
|
5945
6422
|
|
5946
|
-
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].
|
6423
|
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
5947
6424
|
cb(cur_up, "ffn_moe_up", il);
|
5948
6425
|
|
5949
|
-
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].
|
6426
|
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
5950
6427
|
cb(cur_gate, "ffn_moe_gate", il);
|
5951
6428
|
|
5952
6429
|
cur_gate = ggml_silu(ctx0, cur_gate);
|
5953
6430
|
cb(cur_gate, "ffn_moe_silu", il);
|
5954
6431
|
|
5955
|
-
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6432
|
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
5956
6433
|
cb(cur_expert, "ffn_moe_gate_par", il);
|
5957
6434
|
|
5958
|
-
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].
|
6435
|
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
5959
6436
|
cb(cur_expert, "ffn_moe_down", il);
|
5960
6437
|
|
5961
6438
|
cur_expert = ggml_mul(ctx0, cur_expert,
|
@@ -6070,6 +6547,13 @@ struct llm_build_context {
|
|
6070
6547
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6071
6548
|
}
|
6072
6549
|
|
6550
|
+
if (il == n_layer - 1) {
|
6551
|
+
// skip computing output for unused tokens
|
6552
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6553
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6554
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6555
|
+
}
|
6556
|
+
|
6073
6557
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6074
6558
|
cb(ffn_inp, "ffn_inp", il);
|
6075
6559
|
|
@@ -6112,6 +6596,111 @@ struct llm_build_context {
|
|
6112
6596
|
return gf;
|
6113
6597
|
}
|
6114
6598
|
|
6599
|
+
struct ggml_cgraph * build_xverse() {
|
6600
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6601
|
+
|
6602
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6603
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6604
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6605
|
+
|
6606
|
+
struct ggml_tensor * cur;
|
6607
|
+
struct ggml_tensor * inpL;
|
6608
|
+
|
6609
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6610
|
+
|
6611
|
+
// inp_pos - contains the positions
|
6612
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6613
|
+
|
6614
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6615
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6616
|
+
|
6617
|
+
// positions of the tokens in the KV cache
|
6618
|
+
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
6619
|
+
|
6620
|
+
for (int il = 0; il < n_layer; ++il) {
|
6621
|
+
struct ggml_tensor * inpSA = inpL;
|
6622
|
+
|
6623
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6624
|
+
model.layers[il].attn_norm, NULL,
|
6625
|
+
LLM_NORM_RMS, cb, il);
|
6626
|
+
cb(cur, "attn_norm", il);
|
6627
|
+
|
6628
|
+
// self-attention
|
6629
|
+
{
|
6630
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6631
|
+
cb(Qcur, "Qcur", il);
|
6632
|
+
|
6633
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6634
|
+
cb(Kcur, "Kcur", il);
|
6635
|
+
|
6636
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6637
|
+
cb(Vcur, "Vcur", il);
|
6638
|
+
|
6639
|
+
Qcur = ggml_rope_custom(
|
6640
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6641
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6642
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6643
|
+
);
|
6644
|
+
cb(Qcur, "Qcur", il);
|
6645
|
+
|
6646
|
+
Kcur = ggml_rope_custom(
|
6647
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6648
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6649
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6650
|
+
);
|
6651
|
+
cb(Kcur, "Kcur", il);
|
6652
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6653
|
+
model.layers[il].wo, NULL,
|
6654
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6655
|
+
}
|
6656
|
+
|
6657
|
+
if (il == n_layer - 1) {
|
6658
|
+
// skip computing output for unused tokens
|
6659
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6660
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6661
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6662
|
+
}
|
6663
|
+
|
6664
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6665
|
+
cb(ffn_inp, "ffn_inp", il);
|
6666
|
+
|
6667
|
+
// feed-forward network
|
6668
|
+
{
|
6669
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6670
|
+
model.layers[il].ffn_norm, NULL,
|
6671
|
+
LLM_NORM_RMS, cb, il);
|
6672
|
+
cb(cur, "ffn_norm", il);
|
6673
|
+
|
6674
|
+
cur = llm_build_ffn(ctx0, cur,
|
6675
|
+
model.layers[il].ffn_up, NULL,
|
6676
|
+
model.layers[il].ffn_gate, NULL,
|
6677
|
+
model.layers[il].ffn_down, NULL,
|
6678
|
+
NULL,
|
6679
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
6680
|
+
cb(cur, "ffn_out", il);
|
6681
|
+
}
|
6682
|
+
|
6683
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6684
|
+
cb(cur, "l_out", il);
|
6685
|
+
|
6686
|
+
// input for next layer
|
6687
|
+
inpL = cur;
|
6688
|
+
}
|
6689
|
+
|
6690
|
+
cur = inpL;
|
6691
|
+
|
6692
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
|
6693
|
+
cb(cur, "result_norm", -1);
|
6694
|
+
|
6695
|
+
// lm_head
|
6696
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
6697
|
+
cb(cur, "result_output", -1);
|
6698
|
+
|
6699
|
+
ggml_build_forward_expand(gf, cur);
|
6700
|
+
|
6701
|
+
return gf;
|
6702
|
+
}
|
6703
|
+
|
6115
6704
|
struct ggml_cgraph * build_falcon() {
|
6116
6705
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6117
6706
|
|
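Annotation: build_xverse above is a standard LLaMA-style decoder block: RMS-normalize, attend with RoPE'd Q/K, add the residual, RMS-normalize again, run a SiLU-gated FFN, add the residual. Both norm calls use LLM_NORM_RMS; for reference, a scalar sketch of RMSNorm (eps corresponds to hparams.f_norm_rms_eps):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // y[i] = w[i] * x[i] / sqrt(mean(x^2) + eps)
    std::vector<float> rms_norm(const std::vector<float> & x,
                                const std::vector<float> & w, float eps) {
        double sum_sq = 0.0;
        for (float v : x) sum_sq += (double) v * v;
        const float scale = 1.0f / std::sqrt((float) (sum_sq / x.size()) + eps);

        std::vector<float> y(x.size());
        for (std::size_t i = 0; i < x.size(); ++i) {
            y[i] = w[i] * x[i] * scale;
        }
        return y;
    }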
@@ -6185,6 +6774,14 @@ struct llm_build_context {
|
|
6185
6774
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6186
6775
|
}
|
6187
6776
|
|
6777
|
+
if (il == n_layer - 1) {
|
6778
|
+
// skip computing output for unused tokens
|
6779
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6780
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6781
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
6782
|
+
attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
|
6783
|
+
}
|
6784
|
+
|
6188
6785
|
struct ggml_tensor * ffn_inp = cur;
|
6189
6786
|
|
6190
6787
|
// feed forward
|
@@ -6225,6 +6822,214 @@ struct llm_build_context {
|
|
6225
6822
|
return gf;
|
6226
6823
|
}
|
6227
6824
|
|
6825
|
+
struct ggml_cgraph * build_grok() {
|
6826
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6827
|
+
|
6828
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
6829
|
+
int32_t n_tokens = this->n_tokens;
|
6830
|
+
|
6831
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6832
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6833
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6834
|
+
|
6835
|
+
struct ggml_tensor * cur;
|
6836
|
+
struct ggml_tensor * inpL;
|
6837
|
+
|
6838
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6839
|
+
|
6840
|
+
// multiply by embedding_multiplier_scale of 78.38367176906169
|
6841
|
+
inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
|
6842
|
+
|
6843
|
+
// inp_pos - contains the positions
|
6844
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6845
|
+
|
6846
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6847
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6848
|
+
|
6849
|
+
for (int il = 0; il < n_layer; ++il) {
|
6850
|
+
struct ggml_tensor * inpSA = inpL;
|
6851
|
+
|
6852
|
+
// norm
|
6853
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6854
|
+
model.layers[il].attn_norm, NULL,
|
6855
|
+
LLM_NORM_RMS, cb, il);
|
6856
|
+
cb(cur, "attn_norm", il);
|
6857
|
+
|
6858
|
+
|
6859
|
+
// self-attention
|
6860
|
+
{
|
6861
|
+
// compute Q and K and RoPE them
|
6862
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6863
|
+
cb(Qcur, "Qcur", il);
|
6864
|
+
if (model.layers[il].bq) {
|
6865
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
6866
|
+
cb(Qcur, "Qcur", il);
|
6867
|
+
}
|
6868
|
+
|
6869
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6870
|
+
cb(Kcur, "Kcur", il);
|
6871
|
+
if (model.layers[il].bk) {
|
6872
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
6873
|
+
cb(Kcur, "Kcur", il);
|
6874
|
+
}
|
6875
|
+
|
6876
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6877
|
+
cb(Vcur, "Vcur", il);
|
6878
|
+
if (model.layers[il].bv) {
|
6879
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
6880
|
+
cb(Vcur, "Vcur", il);
|
6881
|
+
}
|
6882
|
+
|
6883
|
+
Qcur = ggml_rope_custom(
|
6884
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6885
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6886
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6887
|
+
);
|
6888
|
+
cb(Qcur, "Qcur", il);
|
6889
|
+
|
6890
|
+
Kcur = ggml_rope_custom(
|
6891
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6892
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6893
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6894
|
+
);
|
6895
|
+
cb(Kcur, "Kcur", il);
|
6896
|
+
|
6897
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6898
|
+
model.layers[il].wo, model.layers[il].bo,
|
6899
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
6900
|
+
}
|
6901
|
+
|
6902
|
+
if (il == n_layer - 1) {
|
6903
|
+
// skip computing output for unused tokens
|
6904
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6905
|
+
n_tokens = n_outputs;
|
6906
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6907
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6908
|
+
}
|
6909
|
+
|
6910
|
+
// Grok
|
6911
|
+
// if attn_out_norm is present then apply it before adding the input
|
6912
|
+
if (model.layers[il].attn_out_norm) {
|
6913
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6914
|
+
model.layers[il].attn_out_norm, NULL,
|
6915
|
+
LLM_NORM_RMS, cb, il);
|
6916
|
+
cb(cur, "attn_out_norm", il);
|
6917
|
+
}
|
6918
|
+
|
6919
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6920
|
+
cb(ffn_inp, "ffn_inp", il);
|
6921
|
+
|
6922
|
+
// feed-forward network
|
6923
|
+
// MoE branch
|
6924
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6925
|
+
model.layers[il].ffn_norm, NULL,
|
6926
|
+
LLM_NORM_RMS, cb, il);
|
6927
|
+
cb(cur, "ffn_norm", il);
|
6928
|
+
|
6929
|
+
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
6930
|
+
cb(logits, "ffn_moe_logits", il);
|
6931
|
+
|
6932
|
+
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
6933
|
+
cb(probs, "ffn_moe_probs", il);
|
6934
|
+
|
6935
|
+
// select experts
|
6936
|
+
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
6937
|
+
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
6938
|
+
|
6939
|
+
ggml_tensor * weights = ggml_get_rows(ctx0,
|
6940
|
+
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
6941
|
+
cb(weights, "ffn_moe_weights", il);
|
6942
|
+
|
6943
|
+
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
6944
|
+
|
6945
|
+
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
6946
|
+
cb(weights_sum, "ffn_moe_weights_sum", il);
|
6947
|
+
|
6948
|
+
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
6949
|
+
cb(weights, "ffn_moe_weights_norm", il);
|
6950
|
+
|
6951
|
+
// compute expert outputs
|
6952
|
+
ggml_tensor * moe_out = nullptr;
|
6953
|
+
|
6954
|
+
for (int i = 0; i < n_expert_used; ++i) {
|
6955
|
+
ggml_tensor * cur_expert;
|
6956
|
+
|
6957
|
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
6958
|
+
cb(cur_up, "ffn_moe_up", il);
|
6959
|
+
|
6960
|
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
6961
|
+
cb(cur_gate, "ffn_moe_gate", il);
|
6962
|
+
|
6963
|
+
//GeLU
|
6964
|
+
cur_gate = ggml_gelu(ctx0, cur_gate);
|
6965
|
+
cb(cur_gate, "ffn_moe_gelu", il);
|
6966
|
+
|
6967
|
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6968
|
+
cb(cur_expert, "ffn_moe_gate_par", il);
|
6969
|
+
|
6970
|
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
6971
|
+
cb(cur_expert, "ffn_moe_down", il);
|
6972
|
+
|
6973
|
+
cur_expert = ggml_mul(ctx0, cur_expert,
|
6974
|
+
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
6975
|
+
cb(cur_expert, "ffn_moe_weighted", il);
|
6976
|
+
|
6977
|
+
if (i == 0) {
|
6978
|
+
moe_out = cur_expert;
|
6979
|
+
} else {
|
6980
|
+
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
6981
|
+
cb(moe_out, "ffn_moe_out", il);
|
6982
|
+
}
|
6983
|
+
}
|
6984
|
+
|
6985
|
+
cur = moe_out;
|
6986
|
+
|
6987
|
+
// Grok
|
6988
|
+
// if layer_out_norm is present then apply it before adding the input
|
6989
|
+
// Idea: maybe ffn_out_norm is a better name
|
6990
|
+
if (model.layers[il].layer_out_norm) {
|
6991
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6992
|
+
model.layers[il].layer_out_norm, NULL,
|
6993
|
+
LLM_NORM_RMS, cb, il);
|
6994
|
+
cb(cur, "layer_out_norm", il);
|
6995
|
+
}
|
6996
|
+
|
6997
|
+
|
6998
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6999
|
+
cb(cur, "ffn_out", il);
|
7000
|
+
|
7001
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
7002
|
+
if (layer_dir != nullptr) {
|
7003
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
7004
|
+
}
|
7005
|
+
cb(cur, "l_out", il);
|
7006
|
+
|
7007
|
+
// input for next layer
|
7008
|
+
inpL = cur;
|
7009
|
+
}
|
7010
|
+
|
7011
|
+
cur = inpL;
|
7012
|
+
|
7013
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7014
|
+
model.output_norm, NULL,
|
7015
|
+
LLM_NORM_RMS, cb, -1);
|
7016
|
+
cb(cur, "result_norm", -1);
|
7017
|
+
|
7018
|
+
// lm_head
|
7019
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7020
|
+
|
7021
|
+
// Grok
|
7022
|
+
// multiply logits by output_multiplier_scale of 0.5773502691896257
|
7023
|
+
|
7024
|
+
cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
|
7025
|
+
|
7026
|
+
cb(cur, "result_output", -1);
|
7027
|
+
|
7028
|
+
ggml_build_forward_expand(gf, cur);
|
7029
|
+
|
7030
|
+
return gf;
|
7031
|
+
}
|
7032
|
+
|
6228
7033
|
struct ggml_cgraph * build_starcoder() {
|
6229
7034
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6230
7035
|
|
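Annotation: build_grok above combines the output-skipping blocks with a mixture-of-experts FFN: router logits go through a softmax, the top n_expert_used experts are selected, their weights are renormalized to sum to 1, and each selected expert's GELU-gated output is accumulated into moe_out. It also bakes in Grok's constants: the input embeddings are scaled by 78.38367176906169 and the final logits by 0.5773502691896257 (~= 1/sqrt(3)). A small per-token sketch of the expert mixing, assuming an opaque run_expert() callback (hypothetical, for illustration only):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Mix the top-k experts for one token, mirroring the ffn_moe_* steps in the graph.
    std::vector<float> moe_mix(const std::vector<float> & router_logits,  // [n_expert]
                               std::size_t k,                             // n_expert_used
                               const std::function<std::vector<float>(std::size_t)> & run_expert) {
        // softmax over the router logits (ffn_moe_probs)
        const float max_logit = *std::max_element(router_logits.begin(), router_logits.end());
        std::vector<float> probs(router_logits.size());
        float sum = 0.0f;
        for (std::size_t e = 0; e < probs.size(); ++e) {
            probs[e] = std::exp(router_logits[e] - max_logit);
            sum += probs[e];
        }
        for (float & p : probs) p /= sum;

        // top-k expert ids (ffn_moe_argsort / ggml_top_k)
        std::vector<std::size_t> ids(probs.size());
        std::iota(ids.begin(), ids.end(), 0);
        std::partial_sort(ids.begin(), ids.begin() + k, ids.end(),
                          [&](std::size_t a, std::size_t b) { return probs[a] > probs[b]; });

        // renormalize the selected weights (ffn_moe_weights_norm)
        float wsum = 0.0f;
        for (std::size_t i = 0; i < k; ++i) wsum += probs[ids[i]];

        // weighted sum of the selected experts' outputs (ffn_moe_weighted / ffn_moe_out)
        std::vector<float> out;
        for (std::size_t i = 0; i < k; ++i) {
            std::vector<float> expert = run_expert(ids[i]); // GELU-gated FFN of expert ids[i]
            const float w = probs[ids[i]] / wsum;
            if (out.empty()) out.assign(expert.size(), 0.0f);
            for (std::size_t j = 0; j < out.size(); ++j) out[j] += w * expert[j];
        }
        return out;
    }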
@@ -6279,6 +7084,13 @@ struct llm_build_context {
|
|
6279
7084
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6280
7085
|
}
|
6281
7086
|
|
7087
|
+
if (il == n_layer - 1) {
|
7088
|
+
// skip computing output for unused tokens
|
7089
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7090
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7091
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7092
|
+
}
|
7093
|
+
|
6282
7094
|
// add the input
|
6283
7095
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
6284
7096
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -6476,6 +7288,13 @@ struct llm_build_context {
|
|
6476
7288
|
Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6477
7289
|
}
|
6478
7290
|
|
7291
|
+
if (il == n_layer - 1) {
|
7292
|
+
// skip computing output for unused tokens
|
7293
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7294
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7295
|
+
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
7296
|
+
}
|
7297
|
+
|
6479
7298
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
|
6480
7299
|
cb(ffn_inp, "ffn_inp", il);
|
6481
7300
|
|
@@ -6565,6 +7384,13 @@ struct llm_build_context {
|
|
6565
7384
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6566
7385
|
}
|
6567
7386
|
|
7387
|
+
if (il == n_layer - 1) {
|
7388
|
+
// skip computing output for unused tokens
|
7389
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7390
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7391
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7392
|
+
}
|
7393
|
+
|
6568
7394
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6569
7395
|
cb(ffn_inp, "ffn_inp", il);
|
6570
7396
|
|
@@ -6722,6 +7548,13 @@ struct llm_build_context {
|
|
6722
7548
|
}
|
6723
7549
|
cb(cur, "kqv_out", il);
|
6724
7550
|
|
7551
|
+
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
7552
|
+
// skip computing output for unused tokens
|
7553
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7554
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7555
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7556
|
+
}
|
7557
|
+
|
6725
7558
|
// re-add the layer input
|
6726
7559
|
cur = ggml_add(ctx0, cur, inpL);
|
6727
7560
|
|
@@ -6844,6 +7677,13 @@ struct llm_build_context {
|
|
6844
7677
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6845
7678
|
}
|
6846
7679
|
|
7680
|
+
if (il == n_layer - 1) {
|
7681
|
+
// skip computing output for unused tokens
|
7682
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7683
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7684
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7685
|
+
}
|
7686
|
+
|
6847
7687
|
// Add the input
|
6848
7688
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
6849
7689
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -6891,6 +7731,7 @@ struct llm_build_context {
|
|
6891
7731
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6892
7732
|
|
6893
7733
|
struct ggml_tensor * cur;
|
7734
|
+
struct ggml_tensor * pos;
|
6894
7735
|
struct ggml_tensor * inpL;
|
6895
7736
|
|
6896
7737
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
@@ -6901,6 +7742,16 @@ struct llm_build_context {
|
|
6901
7742
|
// positions of the tokens in the KV cache
|
6902
7743
|
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
6903
7744
|
|
7745
|
+
if (model.pos_embd) {
|
7746
|
+
// inp_pos - contains the positions
|
7747
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7748
|
+
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
7749
|
+
cb(pos, "pos_embd", -1);
|
7750
|
+
|
7751
|
+
inpL = ggml_add(ctx0, inpL, pos);
|
7752
|
+
cb(inpL, "inpL", -1);
|
7753
|
+
}
|
7754
|
+
|
6904
7755
|
for (int il = 0; il < n_layer; ++il) {
|
6905
7756
|
struct ggml_tensor * attn_norm;
|
6906
7757
|
|
@@ -6935,11 +7786,39 @@ struct llm_build_context {
|
|
6935
7786
|
cb(Kcur, "Kcur", il);
|
6936
7787
|
cb(Vcur, "Vcur", il);
|
6937
7788
|
|
6938
|
-
|
7789
|
+
// Q/K Layernorm
|
7790
|
+
if (model.layers[il].attn_q_norm) {
|
7791
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
7792
|
+
model.layers[il].attn_q_norm,
|
7793
|
+
model.layers[il].attn_q_norm_b,
|
7794
|
+
LLM_NORM, cb, il);
|
7795
|
+
cb(Qcur, "Qcur", il);
|
6939
7796
|
|
6940
|
-
|
7797
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
7798
|
+
model.layers[il].attn_k_norm,
|
7799
|
+
model.layers[il].attn_k_norm_b,
|
7800
|
+
LLM_NORM, cb, il);
|
7801
|
+
cb(Kcur, "Kcur", il);
|
7802
|
+
|
7803
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7804
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
7805
|
+
|
7806
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6941
7807
|
model.layers[il].wo, model.layers[il].bo,
|
6942
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
7808
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7809
|
+
} else {
|
7810
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7811
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7812
|
+
model.layers[il].wo, model.layers[il].bo,
|
7813
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7814
|
+
}
|
7815
|
+
}
|
7816
|
+
|
7817
|
+
if (il == n_layer - 1) {
|
7818
|
+
// skip computing output for unused tokens
|
7819
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7820
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7821
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
6943
7822
|
}
|
6944
7823
|
|
6945
7824
|
// Add the input
|
@@ -7055,6 +7934,13 @@ struct llm_build_context {
|
|
7055
7934
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7056
7935
|
}
|
7057
7936
|
|
7937
|
+
if (il == n_layer - 1) {
|
7938
|
+
// skip computing output for unused tokens
|
7939
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7940
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7941
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7942
|
+
}
|
7943
|
+
|
7058
7944
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7059
7945
|
cb(ffn_inp, "ffn_inp", il);
|
7060
7946
|
|
@@ -7161,6 +8047,13 @@ struct llm_build_context {
|
|
7161
8047
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7162
8048
|
}
|
7163
8049
|
|
8050
|
+
if (il == n_layer - 1) {
|
8051
|
+
// skip computing output for unused tokens
|
8052
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8053
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8054
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8055
|
+
}
|
8056
|
+
|
7164
8057
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7165
8058
|
cb(ffn_inp, "ffn_inp", il);
|
7166
8059
|
|
@@ -7273,6 +8166,13 @@ struct llm_build_context {
|
|
7273
8166
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7274
8167
|
}
|
7275
8168
|
|
8169
|
+
if (il == n_layer - 1) {
|
8170
|
+
// skip computing output for unused tokens
|
8171
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8172
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8173
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8174
|
+
}
|
8175
|
+
|
7276
8176
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7277
8177
|
cb(ffn_inp, "ffn_inp", il);
|
7278
8178
|
|
@@ -7391,6 +8291,14 @@ struct llm_build_context {
|
|
7391
8291
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
7392
8292
|
}
|
7393
8293
|
|
8294
|
+
if (il == n_layer - 1) {
|
8295
|
+
// skip computing output for unused tokens
|
8296
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8297
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8298
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8299
|
+
attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
|
8300
|
+
}
|
8301
|
+
|
7394
8302
|
// FF
|
7395
8303
|
{
|
7396
8304
|
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
|
@@ -7488,6 +8396,14 @@ struct llm_build_context {
|
|
7488
8396
|
|
7489
8397
|
cur = attention_norm;
|
7490
8398
|
|
8399
|
+
if (il == n_layer - 1) {
|
8400
|
+
// skip computing output for unused tokens
|
8401
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8402
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8403
|
+
sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
|
8404
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8405
|
+
}
|
8406
|
+
|
7491
8407
|
// feed-forward network
|
7492
8408
|
{
|
7493
8409
|
cur = llm_build_ffn(ctx0, cur,
|
@@ -7580,6 +8496,13 @@ struct llm_build_context {
|
|
7580
8496
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7581
8497
|
}
|
7582
8498
|
|
8499
|
+
if (il == n_layer - 1) {
|
8500
|
+
// skip computing output for unused tokens
|
8501
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8502
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8503
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8504
|
+
}
|
8505
|
+
|
7583
8506
|
// add the input
|
7584
8507
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
7585
8508
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -7680,6 +8603,13 @@ struct llm_build_context {
|
|
7680
8603
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7681
8604
|
}
|
7682
8605
|
|
8606
|
+
if (il == n_layer - 1) {
|
8607
|
+
// skip computing output for unused tokens
|
8608
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8609
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8610
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8611
|
+
}
|
8612
|
+
|
7683
8613
|
// add the input
|
7684
8614
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
7685
8615
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -7789,6 +8719,13 @@ struct llm_build_context {
|
|
7789
8719
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7790
8720
|
}
|
7791
8721
|
|
8722
|
+
if (il == n_layer - 1) {
|
8723
|
+
// skip computing output for unused tokens
|
8724
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8725
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8726
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8727
|
+
}
|
8728
|
+
|
7792
8729
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7793
8730
|
cb(ffn_inp, "ffn_inp", il);
|
7794
8731
|
|
@@ -7899,6 +8836,13 @@ struct llm_build_context {
|
|
7899
8836
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7900
8837
|
}
|
7901
8838
|
|
8839
|
+
if (il == n_layer - 1) {
|
8840
|
+
// skip computing output for unused tokens
|
8841
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8842
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8843
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8844
|
+
}
|
8845
|
+
|
7902
8846
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7903
8847
|
cb(ffn_inp, "ffn_inp", il);
|
7904
8848
|
|
@@ -8022,6 +8966,13 @@ struct llm_build_context {
|
|
8022
8966
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8023
8967
|
}
|
8024
8968
|
|
8969
|
+
if (il == n_layer - 1) {
|
8970
|
+
// skip computing output for unused tokens
|
8971
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8972
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8973
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8974
|
+
}
|
8975
|
+
|
8025
8976
|
// scale_res - scale the hidden states for residual connection
|
8026
8977
|
const float scale_res = scale_depth/sqrtf(float(n_layer));
|
8027
8978
|
cur = ggml_scale(ctx0, cur, scale_res);
|
@@ -8136,6 +9087,13 @@ struct llm_build_context {
|
|
8136
9087
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
8137
9088
|
}
|
8138
9089
|
|
9090
|
+
if (il == n_layer - 1) {
|
9091
|
+
// skip computing output for unused tokens
|
9092
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9093
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9094
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9095
|
+
}
|
9096
|
+
|
8139
9097
|
struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
8140
9098
|
cb(sa_out, "sa_out", il);
|
8141
9099
|
|
@@ -8248,6 +9206,13 @@ struct llm_build_context {
|
|
8248
9206
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8249
9207
|
}
|
8250
9208
|
|
9209
|
+
if (il == n_layer - 1) {
|
9210
|
+
// skip computing output for unused tokens
|
9211
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9212
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9213
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
9214
|
+
}
|
9215
|
+
|
8251
9216
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
8252
9217
|
cb(ffn_inp, "ffn_inp", il);
|
8253
9218
|
|
@@ -8395,6 +9360,15 @@ struct llm_build_context {
|
|
8395
9360
|
|
8396
9361
|
struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
|
8397
9362
|
|
9363
|
+
if (il == n_layer - 1) {
|
9364
|
+
// skip computing output for unused tokens
|
9365
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9366
|
+
x = ggml_get_rows(ctx0, x, inp_out_ids);
|
9367
|
+
y = ggml_get_rows(ctx0, y, inp_out_ids);
|
9368
|
+
z = ggml_get_rows(ctx0, z, inp_out_ids);
|
9369
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9370
|
+
}
|
9371
|
+
|
8398
9372
|
// {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
|
8399
9373
|
y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
|
8400
9374
|
y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
|
@@ -8497,6 +9471,14 @@ struct llm_build_context {
|
|
8497
9471
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8498
9472
|
}
|
8499
9473
|
|
9474
|
+
if (il == n_layer - 1) {
|
9475
|
+
// skip computing output for unused tokens
|
9476
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9477
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9478
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9479
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
9480
|
+
}
|
9481
|
+
|
8500
9482
|
struct ggml_tensor * attn_out = cur;
|
8501
9483
|
|
8502
9484
|
// feed-forward network
|
@@ -8648,6 +9630,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
8648
9630
|
{
|
8649
9631
|
result = llm.build_falcon();
|
8650
9632
|
} break;
|
9633
|
+
case LLM_ARCH_GROK:
|
9634
|
+
{
|
9635
|
+
result = llm.build_grok();
|
9636
|
+
} break;
|
8651
9637
|
case LLM_ARCH_STARCODER:
|
8652
9638
|
{
|
8653
9639
|
result = llm.build_starcoder();
|
@@ -8725,6 +9711,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
8725
9711
|
{
|
8726
9712
|
result = llm.build_mamba();
|
8727
9713
|
} break;
|
9714
|
+
case LLM_ARCH_XVERSE:
|
9715
|
+
{
|
9716
|
+
result = llm.build_xverse();
|
9717
|
+
} break;
|
8728
9718
|
case LLM_ARCH_COMMAND_R:
|
8729
9719
|
{
|
8730
9720
|
result = llm.build_command_r();
|
@@ -8790,9 +9780,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
8790
9780
|
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
8791
9781
|
}
|
8792
9782
|
|
9783
|
+
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
9784
|
+
GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
|
9785
|
+
const int64_t n_tokens = batch.n_tokens;
|
9786
|
+
|
9787
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
|
9788
|
+
int32_t * data = (int32_t *) lctx.inp_out_ids->data;
|
9789
|
+
|
9790
|
+
if (lctx.n_outputs == n_tokens) {
|
9791
|
+
for (int i = 0; i < n_tokens; ++i) {
|
9792
|
+
data[i] = i;
|
9793
|
+
}
|
9794
|
+
} else if (batch.logits) {
|
9795
|
+
int32_t n_outputs = 0;
|
9796
|
+
for (int i = 0; i < n_tokens; ++i) {
|
9797
|
+
if (batch.logits[i]) {
|
9798
|
+
data[n_outputs++] = i;
|
9799
|
+
}
|
9800
|
+
}
|
9801
|
+
// the graph needs to have been passed the correct number of outputs
|
9802
|
+
GGML_ASSERT(lctx.n_outputs == n_outputs);
|
9803
|
+
} else if (lctx.n_outputs == 1) {
|
9804
|
+
// only keep last output
|
9805
|
+
data[0] = n_tokens - 1;
|
9806
|
+
} else {
|
9807
|
+
GGML_ASSERT(lctx.n_outputs == 0);
|
9808
|
+
}
|
9809
|
+
}
|
9810
|
+
|
8793
9811
|
GGML_ASSERT(
|
9812
|
+
// (!a || b) is a logical implication (a -> b)
|
9813
|
+
// !hparams.causal_attn -> !cparams.causal_attn
|
8794
9814
|
(hparams.causal_attn || !cparams.causal_attn) &&
|
8795
|
-
"
|
9815
|
+
"causal attention with embedding models is not supported"
|
8796
9816
|
);
|
8797
9817
|
|
8798
9818
|
if (lctx.inp_KQ_mask) {
|
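Annotation: llama_set_inputs now fills inp_out_ids from the batch: an identity mapping when every token's output is wanted, the positions of the non-zero batch.logits entries when per-token flags are given, or just the last token when a single output is kept. A minimal sketch of that index construction:

    #include <cstdint>
    #include <vector>

    // Which token positions of the batch produce an output?
    std::vector<std::int32_t> output_positions(std::int32_t n_tokens,
                                               const std::int8_t * logits_flags, // may be null
                                               bool logits_all) {
        std::vector<std::int32_t> ids;
        if (logits_all) {
            for (std::int32_t i = 0; i < n_tokens; ++i) ids.push_back(i);  // every token
        } else if (logits_flags) {
            for (std::int32_t i = 0; i < n_tokens; ++i) {
                if (logits_flags[i]) ids.push_back(i);                     // flagged tokens only
            }
        } else {
            ids.push_back(n_tokens - 1);                                   // keep last output only
        }
        return ids;
    }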
@@ -8971,6 +9991,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
8971
9991
|
}
|
8972
9992
|
}
|
8973
9993
|
|
9994
|
+
// Make sure enough space is available for outputs.
|
9995
|
+
// Returns max number of outputs for which space was reserved.
|
9996
|
+
static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
|
9997
|
+
const auto & cparams = lctx.cparams;
|
9998
|
+
const auto & hparams = lctx.model.hparams;
|
9999
|
+
|
10000
|
+
const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
|
10001
|
+
|
10002
|
+
const auto n_batch = cparams.n_batch;
|
10003
|
+
const auto n_vocab = hparams.n_vocab;
|
10004
|
+
const auto n_embd = hparams.n_embd;
|
10005
|
+
|
10006
|
+
// TODO: use a per-batch flag for logits presence instead
|
10007
|
+
const bool has_logits = cparams.causal_attn;
|
10008
|
+
const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
|
10009
|
+
|
10010
|
+
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
|
10011
|
+
const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
|
10012
|
+
|
10013
|
+
if (lctx.output_ids.empty()) {
|
10014
|
+
// init, never resized afterwards
|
10015
|
+
lctx.output_ids.resize(n_batch);
|
10016
|
+
}
|
10017
|
+
|
10018
|
+
const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
|
10019
|
+
const size_t new_size = (logits_size + embd_size) * sizeof(float);
|
10020
|
+
|
10021
|
+
// alloc only when more than the current capacity is required
|
10022
|
+
// TODO: also consider shrinking the buffer
|
10023
|
+
if (!lctx.buf_output || prev_size < new_size) {
|
10024
|
+
if (lctx.buf_output) {
|
10025
|
+
#ifndef NDEBUG
|
10026
|
+
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
|
10027
|
+
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
10028
|
+
#endif
|
10029
|
+
ggml_backend_buffer_free(lctx.buf_output);
|
10030
|
+
lctx.buf_output = nullptr;
|
10031
|
+
lctx.logits = nullptr;
|
10032
|
+
lctx.embd = nullptr;
|
10033
|
+
}
|
10034
|
+
|
10035
|
+
lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
|
10036
|
+
if (lctx.buf_output == nullptr) {
|
10037
|
+
LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
|
10038
|
+
return 0;
|
10039
|
+
}
|
10040
|
+
}
|
10041
|
+
|
10042
|
+
float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
|
10043
|
+
|
10044
|
+
lctx.logits = has_logits ? output_base : nullptr;
|
10045
|
+
lctx.embd = has_embd ? output_base + logits_size : nullptr;
|
10046
|
+
|
10047
|
+
lctx.output_size = n_outputs_max;
|
10048
|
+
lctx.logits_size = logits_size;
|
10049
|
+
lctx.embd_size = embd_size;
|
10050
|
+
|
10051
|
+
// set all ids as invalid (negative)
|
10052
|
+
std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
|
10053
|
+
|
10054
|
+
ggml_backend_buffer_clear(lctx.buf_output, 0);
|
10055
|
+
|
10056
|
+
lctx.n_outputs = 0;
|
10057
|
+
|
10058
|
+
return n_outputs_max;
|
10059
|
+
}
|
10060
|
+
|
10061
|
+
|
8974
10062
|
static void llama_graph_compute(
|
8975
10063
|
llama_context & lctx,
|
8976
10064
|
ggml_cgraph * gf,
|
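Annotation: llama_output_reserve above sizes the output buffer by the number of requested outputs instead of by the whole batch: new_size = (has_logits*n_vocab + has_embd*n_embd) * max(n_outputs, n_seq_max) * sizeof(float), and the buffer is reallocated only when it has to grow. A quick worked example under assumed sizes (n_vocab = 32000, no embeddings, 512 requested outputs):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const std::size_t n_vocab       = 32000; // assumed vocab size
        const std::size_t n_embd        = 0;     // embeddings not requested in this example
        const std::size_t n_outputs_max = 512;   // outputs actually requested for the batch

        const std::size_t new_size = (n_vocab + n_embd) * n_outputs_max * sizeof(float);
        std::printf("output buffer: %.2f MiB\n", new_size / (1024.0 * 1024.0)); // 62.50 MiB
        return 0;
    }

Compared with always reserving room for every token of the batch, sizing by the requested outputs is where the memory saving comes from.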
@@ -9046,16 +10134,8 @@ static int llama_decode_internal(
|
|
9046
10134
|
const int64_t n_embd = hparams.n_embd;
|
9047
10135
|
const int64_t n_vocab = hparams.n_vocab;
|
9048
10136
|
|
9049
|
-
|
9050
|
-
|
9051
|
-
|
9052
|
-
#ifndef NDEBUG
|
9053
|
-
auto & logits_valid = lctx.logits_valid;
|
9054
|
-
logits_valid.clear();
|
9055
|
-
logits_valid.resize(n_tokens_all);
|
9056
|
-
|
9057
|
-
memset(logits_out, 0, lctx.logits_size*sizeof(float));
|
9058
|
-
#endif
|
10137
|
+
uint32_t n_outputs = 0;
|
10138
|
+
uint32_t n_outputs_prev = 0;
|
9059
10139
|
|
9060
10140
|
const auto n_ubatch = cparams.n_ubatch;
|
9061
10141
|
|
@@ -9064,6 +10144,38 @@ static int llama_decode_internal(
|
|
9064
10144
|
std::vector<llama_seq_id *> seq_id_arr;
|
9065
10145
|
std::vector<std::vector<llama_seq_id>> seq_id;
|
9066
10146
|
|
10147
|
+
// count outputs
|
10148
|
+
if (batch_all.logits) {
|
10149
|
+
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
10150
|
+
n_outputs += batch_all.logits[i] != 0;
|
10151
|
+
}
|
10152
|
+
} else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
|
10153
|
+
n_outputs = n_tokens_all;
|
10154
|
+
} else {
|
10155
|
+
// keep last output only
|
10156
|
+
n_outputs = 1;
|
10157
|
+
}
|
10158
|
+
|
10159
|
+
// reserve output buffer
|
10160
|
+
if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
|
10161
|
+
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
|
10162
|
+
return -2;
|
10163
|
+
};
|
10164
|
+
|
10165
|
+
// set output mappings
|
10166
|
+
if (batch_all.logits) {
|
10167
|
+
int32_t i_logits = 0;
|
10168
|
+
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
10169
|
+
if (batch_all.logits[i]) {
|
10170
|
+
lctx.output_ids[i] = i_logits++;
|
10171
|
+
}
|
10172
|
+
}
|
10173
|
+
} else {
|
10174
|
+
for (uint32_t i = 0; i < n_outputs; ++i) {
|
10175
|
+
lctx.output_ids[i] = i;
|
10176
|
+
}
|
10177
|
+
}
|
10178
|
+
|
9067
10179
|
for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
|
9068
10180
|
const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
|
9069
10181
|
llama_batch u_batch = {
|
@@ -9079,6 +10191,27 @@ static int llama_decode_internal(
|
|
9079
10191
|
/* .all_seq_id = */ batch_all.all_seq_id,
|
9080
10192
|
};
|
9081
10193
|
|
10194
|
+
// count the outputs in this u_batch
|
10195
|
+
{
|
10196
|
+
int32_t n_outputs_new = 0;
|
10197
|
+
|
10198
|
+
if (u_batch.logits) {
|
10199
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
10200
|
+
n_outputs_new += u_batch.logits[i] != 0;
|
10201
|
+
}
|
10202
|
+
} else if (n_outputs == n_tokens_all) {
|
10203
|
+
n_outputs_new = n_tokens;
|
10204
|
+
} else {
|
10205
|
+
// keep last output only
|
10206
|
+
if (cur_token + n_tokens >= n_tokens_all) {
|
10207
|
+
n_outputs_new = 1;
|
10208
|
+
}
|
10209
|
+
}
|
10210
|
+
|
10211
|
+
// needs to happen before the graph is built
|
10212
|
+
lctx.n_outputs = n_outputs_new;
|
10213
|
+
}
|
10214
|
+
|
9082
10215
|
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
9083
10216
|
GGML_ASSERT(n_threads > 0);
|
9084
10217
|
|
@@ -9142,23 +10275,37 @@ static int llama_decode_internal(
|
|
9142
10275
|
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
9143
10276
|
struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
|
9144
10277
|
|
9145
|
-
if (
|
10278
|
+
if (lctx.n_outputs == 0) {
|
10279
|
+
// no output
|
10280
|
+
res = nullptr;
|
10281
|
+
embd = nullptr;
|
10282
|
+
} else if (!hparams.causal_attn) {
|
9146
10283
|
res = nullptr; // do not extract logits for embedding models such as BERT
|
9147
10284
|
|
9148
10285
|
// token or sequence embeddings
|
9149
10286
|
embd = gf->nodes[gf->n_nodes - 1];
|
9150
10287
|
|
9151
10288
|
GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
|
9152
|
-
} else {
|
9153
|
-
|
9154
|
-
|
9155
|
-
|
9156
|
-
|
9157
|
-
|
9158
|
-
|
9159
|
-
}
|
9160
|
-
|
10289
|
+
} else if (cparams.embeddings) {
|
10290
|
+
// the embeddings could be in the second to last tensor, or any of the previous tensors
|
10291
|
+
int i_embd = gf->n_nodes - 2;
|
10292
|
+
for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
|
10293
|
+
i_embd = gf->n_nodes - i;
|
10294
|
+
if (i_embd < 0) { break; }
|
10295
|
+
embd = gf->nodes[i_embd];
|
10296
|
+
}
|
10297
|
+
GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
|
10298
|
+
|
10299
|
+
// TODO: use a per-batch flag to know when to skip logits while keeping embeddings
|
10300
|
+
if (!cparams.causal_attn) {
|
10301
|
+
res = nullptr; // do not extract logits when not needed
|
10302
|
+
// skip computing logits
|
10303
|
+
// TODO: is this safe?
|
10304
|
+
gf->n_nodes = i_embd + 1;
|
9161
10305
|
}
|
10306
|
+
} else {
|
10307
|
+
embd = nullptr; // do not extract embeddings when not needed
|
10308
|
+
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
|
9162
10309
|
}
|
9163
10310
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
9164
10311
|
|
@@ -9201,50 +10348,23 @@ static int llama_decode_internal(
|
|
9201
10348
|
//}
|
9202
10349
|
|
9203
10350
|
// extract logits
|
9204
|
-
// TODO: do not compute and extract logits if only embeddings are needed
|
9205
|
-
// update the graphs to skip "result_output" if logits are not needed
|
9206
10351
|
if (res) {
|
9207
10352
|
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
|
9208
10353
|
GGML_ASSERT(backend_res != nullptr);
|
9209
|
-
|
9210
|
-
|
9211
|
-
|
9212
|
-
|
9213
|
-
|
9214
|
-
|
9215
|
-
|
9216
|
-
|
9217
|
-
|
9218
|
-
// extract logits for the range [i_first, i_last)
|
9219
|
-
// group the requests to minimize the number of calls to the backend
|
9220
|
-
ggml_backend_tensor_get_async(backend_res, res,
|
9221
|
-
logits_out + n_vocab*(cur_token + i_first),
|
9222
|
-
i_first*n_vocab*sizeof(float),
|
9223
|
-
(i_last - i_first)*n_vocab*sizeof(float));
|
9224
|
-
i_first = -1;
|
9225
|
-
}
|
9226
|
-
}
|
9227
|
-
#ifndef NDEBUG
|
9228
|
-
logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
|
9229
|
-
#endif
|
9230
|
-
}
|
9231
|
-
} else if (lctx.logits_all) {
|
9232
|
-
ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
|
9233
|
-
#ifndef NDEBUG
|
9234
|
-
std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
|
9235
|
-
#endif
|
9236
|
-
} else {
|
9237
|
-
if (cur_token + n_tokens >= n_tokens_all) {
|
9238
|
-
ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
|
9239
|
-
#ifndef NDEBUG
|
9240
|
-
logits_valid[0] = true;
|
9241
|
-
#endif
|
9242
|
-
}
|
10354
|
+
GGML_ASSERT(lctx.logits != nullptr);
|
10355
|
+
|
10356
|
+
float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
|
10357
|
+
const int32_t n_outputs_new = lctx.n_outputs;
|
10358
|
+
|
10359
|
+
if (n_outputs_new) {
|
10360
|
+
GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
|
10361
|
+
GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
|
10362
|
+
ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
|
9243
10363
|
}
|
9244
10364
|
}
|
9245
10365
|
|
9246
10366
|
// extract embeddings
|
9247
|
-
if (
|
10367
|
+
if (embd) {
|
9248
10368
|
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
|
9249
10369
|
GGML_ASSERT(backend_embd != nullptr);
|
9250
10370
|
|
@@ -9252,16 +10372,14 @@ static int llama_decode_internal(
|
|
9252
10372
|
case LLAMA_POOLING_TYPE_NONE:
|
9253
10373
|
{
|
9254
10374
|
// extract token embeddings
|
9255
|
-
|
9256
|
-
|
9257
|
-
|
9258
|
-
|
9259
|
-
|
9260
|
-
|
9261
|
-
|
9262
|
-
|
9263
|
-
ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
|
9264
|
-
}
|
10375
|
+
GGML_ASSERT(lctx.embd != nullptr);
|
10376
|
+
float * embd_out = lctx.embd + n_outputs_prev*n_embd;
|
10377
|
+
const int32_t n_outputs_new = lctx.n_outputs;
|
10378
|
+
|
10379
|
+
if (n_outputs_new) {
|
10380
|
+
GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
|
10381
|
+
GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
|
10382
|
+
ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
|
9265
10383
|
}
|
9266
10384
|
} break;
|
9267
10385
|
case LLAMA_POOLING_TYPE_CLS:
|
@@ -9288,6 +10406,7 @@ static int llama_decode_internal(
|
|
9288
10406
|
} break;
|
9289
10407
|
}
|
9290
10408
|
}
|
10409
|
+
n_outputs_prev += lctx.n_outputs;
|
9291
10410
|
}
|
9292
10411
|
|
9293
10412
|
// wait for the computation to finish (automatically done when obtaining the model output)
|
@@ -10218,7 +11337,7 @@ struct llm_tokenizer_wpm {
|
|
10218
11337
|
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
|
10219
11338
|
continue;
|
10220
11339
|
}
|
10221
|
-
code =
|
11340
|
+
code = unicode_tolower(code);
|
10222
11341
|
if (type == CODEPOINT_TYPE_WHITESPACE) {
|
10223
11342
|
code = ' ';
|
10224
11343
|
}
|
@@ -10238,7 +11357,7 @@ struct llm_tokenizer_wpm {
|
|
10238
11357
|
std::vector<std::string> words;
|
10239
11358
|
while (r < new_str.size()) {
|
10240
11359
|
// if is whitespace
|
10241
|
-
if (isspace(new_str[r])) {
|
11360
|
+
if (isspace(new_str[r], std::locale::classic())) {
|
10242
11361
|
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
10243
11362
|
l = r + 1;
|
10244
11363
|
r = l;
|
@@ -10252,18 +11371,12 @@ struct llm_tokenizer_wpm {
|
|
10252
11371
|
return words;
|
10253
11372
|
}
|
10254
11373
|
|
10255
|
-
uint32_t to_lower(uint32_t code) {
|
10256
|
-
static const std::locale locale("en_US.UTF-8");
|
10257
|
-
#if defined(_WIN32)
|
10258
|
-
if (code > 0xFFFF) {
|
10259
|
-
return code;
|
10260
|
-
}
|
10261
|
-
#endif
|
10262
|
-
return std::tolower(wchar_t(code), locale);
|
10263
|
-
}
|
10264
|
-
|
10265
11374
|
bool is_ascii_punct(uint32_t code) {
|
10266
|
-
|
11375
|
+
if (code > 0xFF) {
|
11376
|
+
return false;
|
11377
|
+
}
|
11378
|
+
auto c = char(static_cast<unsigned char>(code));
|
11379
|
+
return ispunct(c, std::locale::classic());
|
10267
11380
|
}
|
10268
11381
|
|
10269
11382
|
bool is_chinese_char(uint32_t cpt) {
|
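The tokenizer checks above now go through std::locale::classic(), so their results no longer depend on the host process locale. A minimal illustration of that pattern (helper names are assumptions, not part of the patch):

    #include <locale>

    // "C" locale: deterministic, ASCII-only classification regardless of the global locale.
    static bool wpm_is_space(char c) { return std::isspace(c, std::locale::classic()); }
    static bool wpm_is_punct(char c) { return std::ispunct(c, std::locale::classic()); }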
@@ -10508,28 +11621,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
10508
11621
|
// grammar - internal
|
10509
11622
|
//
|
10510
11623
|
|
10511
|
-
struct llama_partial_utf8 {
|
10512
|
-
uint32_t value; // bit value so far (unshifted)
|
10513
|
-
int n_remain; // num bytes remaining; -1 indicates invalid sequence
|
10514
|
-
};
|
10515
|
-
|
10516
|
-
struct llama_grammar {
|
10517
|
-
const std::vector<std::vector<llama_grammar_element>> rules;
|
10518
|
-
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
10519
|
-
|
10520
|
-
// buffer for partially generated UTF-8 sequence from accepted tokens
|
10521
|
-
llama_partial_utf8 partial_utf8;
|
10522
|
-
};
|
10523
|
-
|
10524
|
-
struct llama_grammar_candidate {
|
10525
|
-
size_t index;
|
10526
|
-
const uint32_t * code_points;
|
10527
|
-
llama_partial_utf8 partial_utf8;
|
10528
|
-
};
|
10529
11624
|
|
10530
11625
|
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
|
10531
11626
|
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
|
10532
|
-
|
11627
|
+
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
10533
11628
|
const std::string & src,
|
10534
11629
|
llama_partial_utf8 partial_start) {
|
10535
11630
|
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
|
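decode_utf8 carries a llama_partial_utf8 state between calls, so a multi-byte character split across two token pieces is completed on the next call instead of being rejected. A sketch of that incremental use, assuming code that lives inside llama.cpp (or a test linked against it); the byte strings are illustrative:

    // Feed two pieces whose boundary splits "é" (U+00E9, bytes C3 A9).
    llama_partial_utf8 st = { 0, 0 };

    auto a = decode_utf8("\xC3", st);     // only the lead byte seen so far
    st = a.second;                        // st.n_remain == 1: one continuation byte missing
    auto b = decode_utf8("\xA9 ok", st);  // completes U+00E9, then decodes ' ', 'o', 'k'
    // b.second.n_remain == -1 would indicate an invalid sequence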
@@ -10731,7 +11826,7 @@ static void llama_grammar_advance_stack(
|
|
10731
11826
|
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
10732
11827
|
// produces the N possible stacks if the given char is accepted at those
|
10733
11828
|
// positions
|
10734
|
-
|
11829
|
+
std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
10735
11830
|
const std::vector<std::vector<llama_grammar_element>> & rules,
|
10736
11831
|
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
10737
11832
|
const uint32_t chr) {
|
@@ -11957,7 +13052,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
11957
13052
|
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
11958
13053
|
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
11959
13054
|
// tensor name.
|
11960
|
-
n_layer /= n_expert;
|
11961
13055
|
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
|
11962
13056
|
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
|
11963
13057
|
}
|
@@ -11971,30 +13065,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
11971
13065
|
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
|
11972
13066
|
// with the quantization of the output tensor
|
11973
13067
|
if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
|
11974
|
-
|
11975
|
-
|
11976
|
-
|
11977
|
-
|
11978
|
-
|
11979
|
-
|
11980
|
-
|
11981
|
-
|
11982
|
-
|
11983
|
-
|
13068
|
+
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
13069
|
+
new_type = qs.params->output_tensor_type;
|
13070
|
+
} else {
|
13071
|
+
int nx = tensor->ne[0];
|
13072
|
+
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
13073
|
+
new_type = GGML_TYPE_Q8_0;
|
13074
|
+
}
|
13075
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
13076
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
|
13077
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
13078
|
+
new_type = GGML_TYPE_Q5_K;
|
13079
|
+
}
|
13080
|
+
else if (new_type != GGML_TYPE_Q8_0) {
|
13081
|
+
new_type = GGML_TYPE_Q6_K;
|
13082
|
+
}
|
11984
13083
|
}
|
11985
13084
|
} else if (name == "token_embd.weight") {
|
11986
|
-
if (
|
11987
|
-
|
11988
|
-
|
11989
|
-
|
11990
|
-
|
11991
|
-
|
11992
|
-
|
11993
|
-
|
11994
|
-
|
13085
|
+
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
|
13086
|
+
new_type = qs.params->token_embedding_type;
|
13087
|
+
} else {
|
13088
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
|
13089
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
13090
|
+
new_type = GGML_TYPE_Q2_K;
|
13091
|
+
}
|
13092
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
13093
|
+
new_type = GGML_TYPE_IQ3_S;
|
13094
|
+
}
|
13095
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
13096
|
+
new_type = GGML_TYPE_IQ3_S;
|
13097
|
+
}
|
11995
13098
|
}
|
11996
13099
|
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
11997
|
-
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
13100
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
11998
13101
|
if (name.find("attn_v.weight") != std::string::npos) {
|
11999
13102
|
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
12000
13103
|
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
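With the new params fields, the type of the output and token-embedding tensors can be forced from the caller instead of falling out of the ftype heuristics above. A sketch of that usage; file names and the chosen types are placeholders:

    #include "llama.h"

    static void quantize_with_type_overrides() {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        qparams.output_tensor_type   = GGML_TYPE_Q8_0; // GGML_TYPE_COUNT (the default) keeps the heuristics
        qparams.token_embedding_type = GGML_TYPE_Q4_K;

        llama_model_quantize("model-f16.gguf", "model-Q4_K_M.gguf", &qparams);
    }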
@@ -12013,7 +13116,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
12013
13116
|
if (qs.model.hparams.n_expert == 8) {
|
12014
13117
|
new_type = GGML_TYPE_Q5_K;
|
12015
13118
|
} else {
|
12016
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
|
13119
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
|
12017
13120
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
|
12018
13121
|
}
|
12019
13122
|
}
|
@@ -12027,13 +13130,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
12027
13130
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
12028
13131
|
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
|
12029
13132
|
}
|
12030
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
12031
|
-
new_type = GGML_TYPE_Q4_K;
|
12032
|
-
}
|
12033
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
12034
|
-
new_type = GGML_TYPE_Q4_K;
|
12035
|
-
}
|
12036
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
13133
|
+
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
|
12037
13134
|
new_type = GGML_TYPE_Q4_K;
|
12038
13135
|
}
|
12039
13136
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
@@ -12186,7 +13283,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
12186
13283
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
12187
13284
|
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
|
12188
13285
|
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
|
12189
|
-
new_type == GGML_TYPE_IQ3_XXS ||
|
13286
|
+
new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
|
13287
|
+
new_type == GGML_TYPE_IQ1_M) {
|
12190
13288
|
int nx = tensor->ne[0];
|
12191
13289
|
int ny = tensor->ne[1];
|
12192
13290
|
if (nx % QK_K != 0) {
|
@@ -12204,6 +13302,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
12204
13302
|
case GGML_TYPE_IQ3_XXS:
|
12205
13303
|
case GGML_TYPE_IQ3_S:
|
12206
13304
|
case GGML_TYPE_IQ1_S:
|
13305
|
+
case GGML_TYPE_IQ1_M:
|
12207
13306
|
case GGML_TYPE_Q2_K:
|
12208
13307
|
case GGML_TYPE_Q3_K:
|
12209
13308
|
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
|
@@ -12285,6 +13384,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12285
13384
|
case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
|
12286
13385
|
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
|
12287
13386
|
case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
|
13387
|
+
case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
|
12288
13388
|
case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
|
12289
13389
|
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
|
12290
13390
|
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
|
@@ -12307,8 +13407,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12307
13407
|
constexpr bool use_mmap = false;
|
12308
13408
|
#endif
|
12309
13409
|
|
12310
|
-
|
12311
|
-
|
13410
|
+
llama_model_kv_override * kv_overrides = nullptr;
|
13411
|
+
if (params->kv_overrides) {
|
13412
|
+
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
13413
|
+
kv_overrides = v->data();
|
13414
|
+
}
|
13415
|
+
llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
|
13416
|
+
ml.init_mappings(false); // no prefetching
|
12312
13417
|
|
12313
13418
|
llama_model model;
|
12314
13419
|
llm_load_arch(ml, model);
|
@@ -12332,36 +13437,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12332
13437
|
struct gguf_context * ctx_out = gguf_init_empty();
|
12333
13438
|
|
12334
13439
|
// copy the KV pairs from the input file
|
12335
|
-
gguf_set_kv (ctx_out, ml.
|
13440
|
+
gguf_set_kv (ctx_out, ml.meta);
|
12336
13441
|
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
12337
13442
|
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
12338
13443
|
|
13444
|
+
if (params->kv_overrides) {
|
13445
|
+
const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
|
13446
|
+
for (auto & o : overrides) {
|
13447
|
+
if (o.key[0] == 0) break;
|
13448
|
+
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
13449
|
+
gguf_set_val_f32(ctx_out, o.key, o.float_value);
|
13450
|
+
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
13451
|
+
gguf_set_val_i32(ctx_out, o.key, o.int_value);
|
13452
|
+
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
13453
|
+
gguf_set_val_bool(ctx_out, o.key, o.bool_value);
|
13454
|
+
} else {
|
13455
|
+
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
|
13456
|
+
}
|
13457
|
+
}
|
13458
|
+
}
|
13459
|
+
|
12339
13460
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
12340
|
-
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
13461
|
+
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
12341
13462
|
|
12342
13463
|
const std::string name = ggml_get_name(meta);
|
12343
13464
|
|
12344
13465
|
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
12345
13466
|
if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
|
12346
13467
|
++qs.n_attention_wv;
|
12347
|
-
}
|
12348
|
-
else if (name.find("ffn_down") != std::string::npos) {
|
12349
|
-
++qs.n_ffn_down;
|
12350
|
-
}
|
12351
|
-
else if (name.find("ffn_gate") != std::string::npos) {
|
12352
|
-
++qs.n_ffn_gate;
|
12353
|
-
}
|
12354
|
-
else if (name.find("ffn_up") != std::string::npos) {
|
12355
|
-
++qs.n_ffn_up;
|
12356
|
-
}
|
12357
|
-
else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
13468
|
+
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
12358
13469
|
qs.has_output = true;
|
12359
13470
|
}
|
12360
13471
|
}
|
12361
|
-
|
12362
|
-
|
12363
|
-
|
12364
|
-
|
13472
|
+
|
13473
|
+
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
13474
|
+
|
13475
|
+
// sanity checks
|
13476
|
+
GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
|
12365
13477
|
|
12366
13478
|
size_t total_size_org = 0;
|
12367
13479
|
size_t total_size_new = 0;
|
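params->kv_overrides is consumed above as a pointer to a std::vector<llama_model_kv_override> terminated by an entry with an empty key, and each override is written into the output GGUF. A sketch of building such a vector on the caller side; the metadata key used here is a placeholder:

    #include "llama.h"
    #include <cstring>
    #include <vector>

    static void quantize_with_kv_override(const char * fname_in, const char * fname_out) {
        std::vector<llama_model_kv_override> kv(2);   // value-initialized (zeroed)

        std::strncpy(kv[0].key, "some.custom.flag", sizeof(kv[0].key) - 1);
        kv[0].tag        = LLAMA_KV_OVERRIDE_TYPE_BOOL;
        kv[0].bool_value = true;

        kv[1].key[0] = 0;  // empty-key terminator, checked by `if (o.key[0] == 0) break;`

        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.kv_overrides = &kv;  // passed as a pointer to the std::vector, as read above

        llama_model_quantize(fname_in, fname_out, &qparams);
    }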
@@ -12377,7 +13489,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12377
13489
|
|
12378
13490
|
// populate the original tensors so we get an initial meta data
|
12379
13491
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
12380
|
-
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
13492
|
+
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
12381
13493
|
gguf_add_tensor(ctx_out, meta);
|
12382
13494
|
}
|
12383
13495
|
|
@@ -12391,6 +13503,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12391
13503
|
// placeholder for the meta data
|
12392
13504
|
::zeros(fout, meta_size);
|
12393
13505
|
|
13506
|
+
const auto tn = LLM_TN(model.arch);
|
13507
|
+
|
12394
13508
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
12395
13509
|
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
|
12396
13510
|
|
@@ -12413,8 +13527,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12413
13527
|
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
12414
13528
|
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
|
12415
13529
|
|
12416
|
-
// quantize only 2D tensors
|
12417
|
-
quantize &= (ggml_n_dims(tensor)
|
13530
|
+
// quantize only 2D and 3D tensors (experts)
|
13531
|
+
quantize &= (ggml_n_dims(tensor) >= 2);
|
12418
13532
|
quantize &= params->quantize_output_tensor || name != "output.weight";
|
12419
13533
|
quantize &= !params->only_copy;
|
12420
13534
|
|
@@ -12443,6 +13557,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12443
13557
|
if (!params->pure && ggml_is_quantized(default_type)) {
|
12444
13558
|
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
|
12445
13559
|
}
|
13560
|
+
else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
|
13561
|
+
new_type = params->token_embedding_type;
|
13562
|
+
}
|
13563
|
+
else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
|
13564
|
+
new_type = params->output_tensor_type;
|
13565
|
+
}
|
12446
13566
|
|
12447
13567
|
// If we've decided to quantize to the same type the tensor is already
|
12448
13568
|
// in then there's nothing to do.
|
@@ -12463,11 +13583,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12463
13583
|
if (it == imatrix_data->end()) {
|
12464
13584
|
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
12465
13585
|
} else {
|
12466
|
-
if (it->second.size() == (size_t)tensor->ne[0]) {
|
13586
|
+
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
|
12467
13587
|
imatrix = it->second.data();
|
12468
13588
|
} else {
|
12469
13589
|
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
|
12470
|
-
int(it->second.size()), int(tensor->ne[0]), tensor->name);
|
13590
|
+
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
|
13591
|
+
|
13592
|
+
// this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
|
13593
|
+
// this is a significant error and it may be a good idea to abort the process if this happens,
|
13594
|
+
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
|
13595
|
+
// tok_embd should be ignored in this case, since it always causes this warning
|
13596
|
+
if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
|
13597
|
+
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
|
13598
|
+
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
|
13599
|
+
}
|
12471
13600
|
}
|
12472
13601
|
}
|
12473
13602
|
}
|
@@ -12475,6 +13604,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12475
13604
|
new_type == GGML_TYPE_IQ2_XS ||
|
12476
13605
|
new_type == GGML_TYPE_IQ2_S ||
|
12477
13606
|
new_type == GGML_TYPE_IQ1_S ||
|
13607
|
+
(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
|
12478
13608
|
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
12479
13609
|
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
12480
13610
|
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
@@ -12503,15 +13633,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12503
13633
|
new_data = work.data();
|
12504
13634
|
|
12505
13635
|
const int n_per_row = tensor->ne[0];
|
12506
|
-
const int nrows =
|
13636
|
+
const int nrows = tensor->ne[1];
|
12507
13637
|
|
12508
13638
|
static const int min_chunk_size = 32 * 512;
|
12509
13639
|
const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
|
12510
13640
|
|
12511
|
-
const int
|
13641
|
+
const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
13642
|
+
const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
12512
13643
|
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
12513
|
-
new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
|
12514
13644
|
|
13645
|
+
// quantize each expert separately since they have different importance matrices
|
13646
|
+
new_size = 0;
|
13647
|
+
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
13648
|
+
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
13649
|
+
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
13650
|
+
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
13651
|
+
|
13652
|
+
new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
13653
|
+
}
|
12515
13654
|
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
12516
13655
|
}
|
12517
13656
|
total_size_org += ggml_nbytes(tensor);
|
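A worked example of the per-expert offsets used in the loop above (the shape is illustrative, not taken from the patch):

    // For an expert tensor with ne = {4096, 14336, 8}, each of the 8 experts is a
    // 4096 x 14336 matrix quantized independently (i03 = expert index):
    //   f32 slice    : f32_data + i03 * 4096 * 14336
    //   output slice : (char *) new_data + i03 * ggml_row_size(new_type, 4096) * 14336
    //   imatrix slice: imatrix + i03 * 4096   // one importance vector of ne[0] values per expert
    // new_size accumulates the bytes produced for every slice.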
@@ -12582,7 +13721,7 @@ static int llama_apply_lora_from_file_internal(
|
|
12582
13721
|
if (path_base_model) {
|
12583
13722
|
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
12584
13723
|
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
12585
|
-
ml->
|
13724
|
+
ml->init_mappings(/*prefetch*/ false); // no prefetching
|
12586
13725
|
}
|
12587
13726
|
|
12588
13727
|
struct tensor_meta {
|
@@ -12703,7 +13842,7 @@ static int llama_apply_lora_from_file_internal(
|
|
12703
13842
|
|
12704
13843
|
ggml_tensor * base_t;
|
12705
13844
|
if (ml) {
|
12706
|
-
if (
|
13845
|
+
if (!ml->get_tensor_meta(base_name.c_str())) {
|
12707
13846
|
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
12708
13847
|
return 1;
|
12709
13848
|
}
|
@@ -12887,11 +14026,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
12887
14026
|
struct llama_model_quantize_params result = {
|
12888
14027
|
/*.nthread =*/ 0,
|
12889
14028
|
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
14029
|
+
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
|
14030
|
+
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
|
12890
14031
|
/*.allow_requantize =*/ false,
|
12891
14032
|
/*.quantize_output_tensor =*/ true,
|
12892
14033
|
/*.only_copy =*/ false,
|
12893
14034
|
/*.pure =*/ false,
|
12894
14035
|
/*.imatrix =*/ nullptr,
|
14036
|
+
/*.kv_overrides =*/ nullptr,
|
12895
14037
|
};
|
12896
14038
|
|
12897
14039
|
return result;
|
@@ -12900,7 +14042,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
12900
14042
|
size_t llama_max_devices(void) {
|
12901
14043
|
#if defined(GGML_USE_METAL)
|
12902
14044
|
return 1;
|
12903
|
-
#elif defined(
|
14045
|
+
#elif defined(GGML_USE_CUDA)
|
12904
14046
|
return GGML_CUDA_MAX_DEVICES;
|
12905
14047
|
#elif defined(GGML_USE_SYCL)
|
12906
14048
|
return GGML_SYCL_MAX_DEVICES;
|
@@ -12920,8 +14062,8 @@ bool llama_supports_mlock(void) {
|
|
12920
14062
|
}
|
12921
14063
|
|
12922
14064
|
bool llama_supports_gpu_offload(void) {
|
12923
|
-
#if defined(
|
12924
|
-
defined(GGML_USE_SYCL)
|
14065
|
+
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
14066
|
+
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
12925
14067
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
12926
14068
|
return true;
|
12927
14069
|
#else
|
@@ -13028,7 +14170,7 @@ struct llama_context * llama_new_context_with_model(
|
|
13028
14170
|
const auto & hparams = model->hparams;
|
13029
14171
|
auto & cparams = ctx->cparams;
|
13030
14172
|
|
13031
|
-
|
14173
|
+
cparams.n_seq_max = std::max(1u, params.n_seq_max);
|
13032
14174
|
cparams.n_threads = params.n_threads;
|
13033
14175
|
cparams.n_threads_batch = params.n_threads_batch;
|
13034
14176
|
cparams.yarn_ext_factor = params.yarn_ext_factor;
|
@@ -13126,7 +14268,7 @@ struct llama_context * llama_new_context_with_model(
|
|
13126
14268
|
}
|
13127
14269
|
ctx->backends.push_back(ctx->backend_metal);
|
13128
14270
|
}
|
13129
|
-
#elif defined(
|
14271
|
+
#elif defined(GGML_USE_CUDA)
|
13130
14272
|
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
13131
14273
|
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
13132
14274
|
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
@@ -13149,7 +14291,20 @@ struct llama_context * llama_new_context_with_model(
|
|
13149
14291
|
}
|
13150
14292
|
}
|
13151
14293
|
#elif defined(GGML_USE_VULKAN)
|
13152
|
-
if (model->
|
14294
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
14295
|
+
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
|
14296
|
+
llama_free(ctx);
|
14297
|
+
return nullptr;
|
14298
|
+
}
|
14299
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
14300
|
+
ggml_backend_t backend = ggml_backend_vk_init(0);
|
14301
|
+
if (backend == nullptr) {
|
14302
|
+
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
14303
|
+
llama_free(ctx);
|
14304
|
+
return nullptr;
|
14305
|
+
}
|
14306
|
+
ctx->backends.push_back(backend);
|
14307
|
+
} else {
|
13153
14308
|
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
13154
14309
|
ggml_backend_t backend = ggml_backend_vk_init(device);
|
13155
14310
|
if (backend == nullptr) {
|
@@ -13161,30 +14316,28 @@ struct llama_context * llama_new_context_with_model(
|
|
13161
14316
|
}
|
13162
14317
|
}
|
13163
14318
|
#elif defined(GGML_USE_SYCL)
|
13164
|
-
|
13165
|
-
|
13166
|
-
|
13167
|
-
|
14319
|
+
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
14320
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
14321
|
+
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
|
14322
|
+
if (backend == nullptr) {
|
14323
|
+
int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
|
14324
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
|
14325
|
+
llama_free(ctx);
|
14326
|
+
return nullptr;
|
14327
|
+
}
|
14328
|
+
ctx->backends.push_back(backend);
|
14329
|
+
} else {
|
14330
|
+
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
14331
|
+
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
14332
|
+
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
13168
14333
|
if (backend == nullptr) {
|
13169
|
-
int
|
13170
|
-
|
14334
|
+
int id_list[GGML_SYCL_MAX_DEVICES];
|
14335
|
+
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
14336
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
|
13171
14337
|
llama_free(ctx);
|
13172
14338
|
return nullptr;
|
13173
14339
|
}
|
13174
14340
|
ctx->backends.push_back(backend);
|
13175
|
-
} else {
|
13176
|
-
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
13177
|
-
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
13178
|
-
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
13179
|
-
if (backend == nullptr) {
|
13180
|
-
int id_list[GGML_SYCL_MAX_DEVICES];
|
13181
|
-
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
13182
|
-
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
|
13183
|
-
llama_free(ctx);
|
13184
|
-
return nullptr;
|
13185
|
-
}
|
13186
|
-
ctx->backends.push_back(backend);
|
13187
|
-
}
|
13188
14341
|
}
|
13189
14342
|
}
|
13190
14343
|
#elif defined(GGML_USE_KOMPUTE)
|
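The SYCL path now mirrors CUDA (one backend for the main GPU with split modes NONE/ROW, one backend per device with LAYER), and Vulkan rejects ROW at context creation. A sketch of choosing the split mode from the API side; the model path is a placeholder:

    #include "llama.h"

    static llama_model * load_on_gpus() {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99;                      // offload as many layers as fit
        mparams.split_mode   = LLAMA_SPLIT_MODE_LAYER;  // one backend per device
        mparams.main_gpu     = 0;                       // used when split_mode is NONE or ROW
        return llama_load_model_from_file("model.gguf", mparams);
    }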
@@ -13232,25 +14385,12 @@ struct llama_context * llama_new_context_with_model(
|
|
13232
14385
|
|
13233
14386
|
// graph outputs buffer
|
13234
14387
|
{
|
13235
|
-
// resized during inference
|
13236
|
-
ctx
|
13237
|
-
|
13238
|
-
|
13239
|
-
const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
|
13240
|
-
|
13241
|
-
ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
|
13242
|
-
if (ctx->buf_output == nullptr) {
|
13243
|
-
LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
|
14388
|
+
// resized during inference when a batch uses more outputs
|
14389
|
+
if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
|
14390
|
+
LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
|
13244
14391
|
llama_free(ctx);
|
13245
14392
|
return nullptr;
|
13246
14393
|
}
|
13247
|
-
ggml_backend_buffer_clear(ctx->buf_output, 0);
|
13248
|
-
|
13249
|
-
|
13250
|
-
ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
|
13251
|
-
if (params.embeddings) {
|
13252
|
-
ctx->embd = ctx->logits + ctx->logits_size;
|
13253
|
-
}
|
13254
14394
|
|
13255
14395
|
LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
|
13256
14396
|
ggml_backend_buffer_name(ctx->buf_output),
|
@@ -13275,7 +14415,7 @@ struct llama_context * llama_new_context_with_model(
|
|
13275
14415
|
|
13276
14416
|
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
13277
14417
|
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
|
13278
|
-
#ifndef
|
14418
|
+
#ifndef GGML_USE_CUDA
|
13279
14419
|
// pipeline parallelism requires support for async compute and events
|
13280
14420
|
// currently this is only implemented in the CUDA backend
|
13281
14421
|
pipeline_parallel = false;
|
@@ -13383,11 +14523,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
13383
14523
|
case LLM_ARCH_ORION:
|
13384
14524
|
case LLM_ARCH_INTERNLM2:
|
13385
14525
|
case LLM_ARCH_MINICPM:
|
14526
|
+
case LLM_ARCH_XVERSE:
|
13386
14527
|
case LLM_ARCH_COMMAND_R:
|
13387
14528
|
return LLAMA_ROPE_TYPE_NORM;
|
13388
14529
|
|
13389
14530
|
// the pairs of head values are offset by n_rot/2
|
13390
14531
|
case LLM_ARCH_FALCON:
|
14532
|
+
case LLM_ARCH_GROK:
|
13391
14533
|
case LLM_ARCH_PERSIMMON:
|
13392
14534
|
case LLM_ARCH_BERT:
|
13393
14535
|
case LLM_ARCH_NOMIC_BERT:
|
@@ -13766,27 +14908,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
|
|
13766
14908
|
|
13767
14909
|
// Returns the *maximum* size of the state
|
13768
14910
|
size_t llama_get_state_size(const struct llama_context * ctx) {
|
14911
|
+
const auto & cparams = ctx->cparams;
|
14912
|
+
const auto & hparams = ctx->model.hparams;
|
14913
|
+
|
13769
14914
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
13770
14915
|
// for reference, std::mt19937(1337) serializes to 6701 bytes.
|
13771
14916
|
const size_t s_rng_size = sizeof(size_t);
|
13772
14917
|
const size_t s_rng = LLAMA_MAX_RNG_STATE;
|
14918
|
+
const size_t s_n_outputs = sizeof(size_t);
|
14919
|
+
// assume worst case for outputs although only currently set ones are serialized
|
14920
|
+
const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t);
|
13773
14921
|
const size_t s_logits_size = sizeof(size_t);
|
13774
|
-
|
13775
|
-
const size_t s_logits = ctx->logits_size * sizeof(float);
|
14922
|
+
const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
|
13776
14923
|
const size_t s_embedding_size = sizeof(size_t);
|
13777
|
-
const size_t s_embedding = ctx->embd_size * sizeof(float);
|
14924
|
+
const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0;
|
13778
14925
|
const size_t s_kv_buf_size = sizeof(size_t);
|
13779
14926
|
const size_t s_kv_head = sizeof(uint32_t);
|
13780
14927
|
const size_t s_kv_size = sizeof(uint32_t);
|
13781
14928
|
const size_t s_kv_used = sizeof(uint32_t);
|
13782
14929
|
const size_t s_kv = ctx->kv_self.total_size();
|
13783
|
-
|
13784
|
-
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
|
14930
|
+
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
|
13785
14931
|
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
|
13786
14932
|
|
13787
14933
|
const size_t s_total = (
|
13788
14934
|
+ s_rng_size
|
13789
14935
|
+ s_rng
|
14936
|
+
+ s_n_outputs
|
14937
|
+
+ s_output_pos
|
13790
14938
|
+ s_logits_size
|
13791
14939
|
+ s_logits
|
13792
14940
|
+ s_embedding_size
|
@@ -13861,7 +15009,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13861
15009
|
std::ostringstream rng_ss;
|
13862
15010
|
rng_ss << ctx->rng;
|
13863
15011
|
|
13864
|
-
const std::string & rng_str
|
15012
|
+
const std::string & rng_str = rng_ss.str();
|
13865
15013
|
const size_t rng_size = rng_str.size();
|
13866
15014
|
|
13867
15015
|
GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
|
@@ -13870,25 +15018,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13870
15018
|
data_ctx->write(rng_str.data(), rng_size);
|
13871
15019
|
}
|
13872
15020
|
|
13873
|
-
// copy
|
15021
|
+
// copy outputs
|
13874
15022
|
{
|
13875
|
-
|
15023
|
+
// Can't use ctx->n_outputs because it's not for the
|
15024
|
+
// entire last batch when n_ubatch is smaller than n_batch
|
15025
|
+
size_t n_outputs = 0;
|
13876
15026
|
|
13877
|
-
|
15027
|
+
// copy output ids
|
15028
|
+
{
|
15029
|
+
std::vector<int32_t> output_pos;
|
13878
15030
|
|
13879
|
-
|
13880
|
-
|
15031
|
+
const size_t n_batch = ctx->cparams.n_batch;
|
15032
|
+
const auto & output_ids = ctx->output_ids;
|
15033
|
+
|
15034
|
+
output_pos.resize(ctx->output_size);
|
15035
|
+
|
15036
|
+
// build a more compact representation of the output ids
|
15037
|
+
for (size_t i = 0; i < n_batch; ++i) {
|
15038
|
+
// map an output id to a position in the batch
|
15039
|
+
int32_t pos = output_ids[i];
|
15040
|
+
if (pos >= 0) {
|
15041
|
+
if ((size_t) pos >= n_outputs) {
|
15042
|
+
n_outputs = pos + 1;
|
15043
|
+
}
|
15044
|
+
GGML_ASSERT((size_t) pos < ctx->output_size);
|
15045
|
+
output_pos[pos] = i;
|
15046
|
+
}
|
15047
|
+
}
|
15048
|
+
|
15049
|
+
data_ctx->write(&n_outputs, sizeof(n_outputs));
|
15050
|
+
|
15051
|
+
if (n_outputs) {
|
15052
|
+
data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
|
15053
|
+
}
|
13881
15054
|
}
|
13882
|
-
}
|
13883
15055
|
|
13884
|
-
|
13885
|
-
|
13886
|
-
|
15056
|
+
// copy logits
|
15057
|
+
{
|
15058
|
+
const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);
|
13887
15059
|
|
13888
|
-
|
15060
|
+
data_ctx->write(&logits_size, sizeof(logits_size));
|
13889
15061
|
|
13890
|
-
|
13891
|
-
|
15062
|
+
if (logits_size) {
|
15063
|
+
data_ctx->write(ctx->logits, logits_size * sizeof(float));
|
15064
|
+
}
|
15065
|
+
}
|
15066
|
+
|
15067
|
+
// copy embeddings
|
15068
|
+
{
|
15069
|
+
const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
|
15070
|
+
|
15071
|
+
data_ctx->write(&embeddings_size, sizeof(embeddings_size));
|
15072
|
+
|
15073
|
+
if (embeddings_size) {
|
15074
|
+
data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
|
15075
|
+
}
|
13892
15076
|
}
|
13893
15077
|
}
|
13894
15078
|
|
@@ -13901,9 +15085,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13901
15085
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
13902
15086
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
13903
15087
|
|
13904
|
-
|
15088
|
+
// NOTE: kv_size and kv_buf_size are mostly used for sanity checks
|
13905
15089
|
const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
|
13906
15090
|
const uint32_t kv_size = kv_self.size;
|
15091
|
+
const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
|
13907
15092
|
const uint32_t kv_used = kv_self.used;
|
13908
15093
|
|
13909
15094
|
data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
|
@@ -13912,6 +15097,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13912
15097
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
13913
15098
|
|
13914
15099
|
if (kv_buf_size) {
|
15100
|
+
const size_t pre_kv_buf_size = data_ctx->get_size_written();
|
15101
|
+
|
13915
15102
|
std::vector<uint8_t> tmp_buf;
|
13916
15103
|
for (int il = 0; il < (int) n_layer; ++il) {
|
13917
15104
|
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
@@ -13941,6 +15128,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13941
15128
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
13942
15129
|
}
|
13943
15130
|
}
|
15131
|
+
GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
|
13944
15132
|
}
|
13945
15133
|
|
13946
15134
|
for (uint32_t i = 0; i < kv_head; ++i) {
|
@@ -13985,6 +15173,28 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
13985
15173
|
GGML_ASSERT(!rng_ss.fail());
|
13986
15174
|
}
|
13987
15175
|
|
15176
|
+
// set output ids
|
15177
|
+
{
|
15178
|
+
size_t n_outputs;
|
15179
|
+
std::vector<int32_t> output_pos;
|
15180
|
+
|
15181
|
+
memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
|
15182
|
+
|
15183
|
+
GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
|
15184
|
+
|
15185
|
+
if (n_outputs) {
|
15186
|
+
output_pos.resize(n_outputs);
|
15187
|
+
memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
|
15188
|
+
inp += n_outputs * sizeof(int32_t);
|
15189
|
+
|
15190
|
+
for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
|
15191
|
+
int32_t id = output_pos[i];
|
15192
|
+
GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
|
15193
|
+
ctx->output_ids[id] = i;
|
15194
|
+
}
|
15195
|
+
}
|
15196
|
+
}
|
15197
|
+
|
13988
15198
|
// set logits
|
13989
15199
|
{
|
13990
15200
|
size_t logits_size;
|
@@ -14005,7 +15215,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
14005
15215
|
|
14006
15216
|
memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
|
14007
15217
|
|
14008
|
-
GGML_ASSERT(ctx->embd_size
|
15218
|
+
GGML_ASSERT(ctx->embd_size >= embeddings_size);
|
14009
15219
|
|
14010
15220
|
if (embeddings_size) {
|
14011
15221
|
memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
|
@@ -14032,8 +15242,18 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
14032
15242
|
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
14033
15243
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
14034
15244
|
|
15245
|
+
if (kv_self.size != kv_size) {
|
15246
|
+
// the KV cache needs to be big enough to load all the KV cells from the saved state
|
15247
|
+
GGML_ASSERT(kv_self.size >= kv_head);
|
15248
|
+
|
15249
|
+
LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
|
15250
|
+
__func__, kv_head, kv_size, kv_self.size);
|
15251
|
+
}
|
15252
|
+
|
14035
15253
|
if (kv_buf_size) {
|
14036
|
-
|
15254
|
+
const size_t pre_kv_buf_size = inp - src;
|
15255
|
+
|
15256
|
+
GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
|
14037
15257
|
|
14038
15258
|
for (int il = 0; il < (int) n_layer; ++il) {
|
14039
15259
|
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
@@ -14053,23 +15273,21 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
14053
15273
|
|
14054
15274
|
// v is not contiguous, copy row by row
|
14055
15275
|
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
14056
|
-
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type,
|
15276
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);
|
14057
15277
|
|
14058
15278
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
14059
15279
|
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
|
14060
15280
|
inp += v_row_size;
|
14061
15281
|
}
|
14062
15282
|
}
|
15283
|
+
GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
|
14063
15284
|
}
|
14064
15285
|
|
14065
|
-
|
15286
|
+
llama_kv_cache_clear(ctx);
|
14066
15287
|
|
14067
15288
|
ctx->kv_self.head = kv_head;
|
14068
|
-
ctx->kv_self.size = kv_size;
|
14069
15289
|
ctx->kv_self.used = kv_used;
|
14070
15290
|
|
14071
|
-
ctx->kv_self.cells.resize(kv_size);
|
14072
|
-
|
14073
15291
|
for (uint32_t i = 0; i < kv_head; ++i) {
|
14074
15292
|
llama_pos pos;
|
14075
15293
|
size_t seq_id_size;
|
@@ -14086,11 +15304,6 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
14086
15304
|
ctx->kv_self.cells[i].seq_id.insert(seq_id);
|
14087
15305
|
}
|
14088
15306
|
}
|
14089
|
-
|
14090
|
-
for (uint32_t i = kv_head; i < kv_size; ++i) {
|
14091
|
-
ctx->kv_self.cells[i].pos = -1;
|
14092
|
-
ctx->kv_self.cells[i].seq_id.clear();
|
14093
|
-
}
|
14094
15307
|
}
|
14095
15308
|
|
14096
15309
|
const size_t nread = inp - src;
|
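The serialized state now also carries the output ids, and a saved KV cache may be restored into a context whose kv_size differs as long as it is large enough. A sketch of round-tripping the state with the public API (helper names are assumptions):

    #include "llama.h"
    #include <vector>

    static std::vector<uint8_t> save_state(llama_context * ctx) {
        std::vector<uint8_t> state(llama_get_state_size(ctx));   // upper bound
        state.resize(llama_copy_state_data(ctx, state.data()));  // actual bytes written
        return state;
    }

    static size_t load_state(llama_context * ctx, const std::vector<uint8_t> & state) {
        return llama_set_state_data(ctx, state.data());          // bytes read
    }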
@@ -14296,11 +15509,33 @@ float * llama_get_logits(struct llama_context * ctx) {
|
|
14296
15509
|
}
|
14297
15510
|
|
14298
15511
|
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
14299
|
-
assert(ctx->logits_valid.at(i));
|
14300
|
-
|
14301
15512
|
llama_synchronize(ctx);
|
14302
15513
|
|
14303
|
-
|
15514
|
+
try {
|
15515
|
+
if (ctx->logits == nullptr) {
|
15516
|
+
throw std::runtime_error("no logits");
|
15517
|
+
}
|
15518
|
+
if ((size_t) i >= ctx->output_ids.size()) {
|
15519
|
+
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
|
15520
|
+
}
|
15521
|
+
const int32_t j = ctx->output_ids[i];
|
15522
|
+
|
15523
|
+
if (j < 0) {
|
15524
|
+
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
15525
|
+
}
|
15526
|
+
if ((size_t) j >= ctx->output_size) {
|
15527
|
+
// This should not happen
|
15528
|
+
throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
|
15529
|
+
}
|
15530
|
+
|
15531
|
+
return ctx->logits + j*ctx->model.hparams.n_vocab;
|
15532
|
+
} catch (const std::exception & err) {
|
15533
|
+
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
|
15534
|
+
#ifndef NDEBUG
|
15535
|
+
GGML_ASSERT(false);
|
15536
|
+
#endif
|
15537
|
+
return nullptr;
|
15538
|
+
}
|
14304
15539
|
}
|
14305
15540
|
|
14306
15541
|
float * llama_get_embeddings(struct llama_context * ctx) {
|
@@ -14312,7 +15547,31 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
|
14312
15547
|
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
14313
15548
|
llama_synchronize(ctx);
|
14314
15549
|
|
14315
|
-
|
15550
|
+
try {
|
15551
|
+
if (ctx->embd == nullptr) {
|
15552
|
+
throw std::runtime_error("no embeddings");
|
15553
|
+
}
|
15554
|
+
if ((size_t) i >= ctx->output_ids.size()) {
|
15555
|
+
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
|
15556
|
+
}
|
15557
|
+
const int32_t j = ctx->output_ids[i];
|
15558
|
+
|
15559
|
+
if (j < 0) {
|
15560
|
+
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
15561
|
+
}
|
15562
|
+
if ((size_t) j >= ctx->output_size) {
|
15563
|
+
// This should not happen
|
15564
|
+
throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
|
15565
|
+
}
|
15566
|
+
|
15567
|
+
return ctx->embd + j*ctx->model.hparams.n_embd;
|
15568
|
+
} catch (const std::exception & err) {
|
15569
|
+
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
|
15570
|
+
#ifndef NDEBUG
|
15571
|
+
GGML_ASSERT(false);
|
15572
|
+
#endif
|
15573
|
+
return nullptr;
|
15574
|
+
}
|
14316
15575
|
}
|
14317
15576
|
|
14318
15577
|
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
|
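The embeddings getter now validates the index the same way and returns nullptr (with a logged reason) instead of asserting in release builds, so callers should check the pointer. A sketch, assuming a context created with embeddings enabled and pooling type NONE; the helper name is an assumption:

    #include "llama.h"
    #include <vector>

    // i is a position in the last decoded batch that was marked as an output.
    static bool copy_token_embedding(llama_context * ctx, const llama_model * model,
                                     int32_t i, std::vector<float> & dst) {
        const float * e = llama_get_embeddings_ith(ctx, i);  // nullptr if i was not an output
        if (e == nullptr) {
            return false;
        }
        dst.assign(e, e + llama_n_embd(model));
        return true;
    }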
@@ -14602,6 +15861,55 @@ static int32_t llama_chat_apply_template_internal(
|
|
14602
15861
|
ss << message->content << "</s>";
|
14603
15862
|
}
|
14604
15863
|
}
|
15864
|
+
} else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
|
15865
|
+
// openchat/openchat-3.5-0106,
|
15866
|
+
for (auto message : chat) {
|
15867
|
+
std::string role(message->role);
|
15868
|
+
if (role == "system") {
|
15869
|
+
ss << message->content << "<|end_of_turn|>";
|
15870
|
+
} else {
|
15871
|
+
role[0] = toupper(role[0]);
|
15872
|
+
ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
|
15873
|
+
}
|
15874
|
+
}
|
15875
|
+
if (add_ass) {
|
15876
|
+
ss << "GPT4 Correct Assistant:";
|
15877
|
+
}
|
15878
|
+
} else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
|
15879
|
+
// eachadea/vicuna-13b-1.1 (and Orca variant)
|
15880
|
+
for (auto message : chat) {
|
15881
|
+
std::string role(message->role);
|
15882
|
+
if (role == "system") {
|
15883
|
+
// Orca-Vicuna variant uses a system prefix
|
15884
|
+
if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
|
15885
|
+
ss << "SYSTEM: " << message->content << "\n";
|
15886
|
+
} else {
|
15887
|
+
ss << message->content << "\n\n";
|
15888
|
+
}
|
15889
|
+
} else if (role == "user") {
|
15890
|
+
ss << "USER: " << message->content << "\n";
|
15891
|
+
} else if (role == "assistant") {
|
15892
|
+
ss << "ASSISTANT: " << message->content << "</s>\n";
|
15893
|
+
}
|
15894
|
+
}
|
15895
|
+
if (add_ass) {
|
15896
|
+
ss << "ASSISTANT:";
|
15897
|
+
}
|
15898
|
+
} else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
|
15899
|
+
// deepseek-ai/deepseek-coder-33b-instruct
|
15900
|
+
for (auto message : chat) {
|
15901
|
+
std::string role(message->role);
|
15902
|
+
if (role == "system") {
|
15903
|
+
ss << message->content;
|
15904
|
+
} else if (role == "user") {
|
15905
|
+
ss << "### Instruction:\n" << message->content << "\n";
|
15906
|
+
} else if (role == "assistant") {
|
15907
|
+
ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
|
15908
|
+
}
|
15909
|
+
}
|
15910
|
+
if (add_ass) {
|
15911
|
+
ss << "### Response:\n";
|
15912
|
+
}
|
14605
15913
|
} else {
|
14606
15914
|
// template not supported
|
14607
15915
|
return -1;
|
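The template matcher above adds openchat, vicuna/vicuna-orca, and deepseek-coder formats. A sketch of requesting one of them explicitly through the public API (messages and buffer size are placeholders):

    #include "llama.h"
    #include <string>
    #include <vector>

    static std::string make_vicuna_prompt() {
        const llama_chat_message msgs[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!"                       },
        };
        std::vector<char> buf(1024);
        const int32_t n = llama_chat_apply_template(
            /*model  =*/ nullptr,    // nullptr -> use the template name passed explicitly
            /*tmpl   =*/ "vicuna",
            msgs, 2,
            /*add_ass=*/ true,
            buf.data(), (int32_t) buf.size());
        return (n > 0 && n <= (int32_t) buf.size()) ? std::string(buf.data(), n) : std::string();
    }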
@@ -14651,6 +15959,30 @@ LLAMA_API int32_t llama_chat_apply_template(
|
|
14651
15959
|
return res;
|
14652
15960
|
}
|
14653
15961
|
|
15962
|
+
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
|
15963
|
+
static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
|
15964
|
+
if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
|
15965
|
+
return strlen(split_path);
|
15966
|
+
}
|
15967
|
+
return 0;
|
15968
|
+
}
|
15969
|
+
|
15970
|
+
int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
|
15971
|
+
std::string str_split_path(split_path);
|
15972
|
+
char postfix[32];
|
15973
|
+
snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
|
15974
|
+
std::string str_postfix(postfix);
|
15975
|
+
|
15976
|
+
// check if dest ends with postfix
|
15977
|
+
int size_prefix = str_split_path.size() - str_postfix.size();
|
15978
|
+
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
|
15979
|
+
snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
|
15980
|
+
return size_prefix;
|
15981
|
+
}
|
15982
|
+
|
15983
|
+
return 0;
|
15984
|
+
}
|
15985
|
+
|
14654
15986
|
struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
14655
15987
|
struct llama_timings result = {
|
14656
15988
|
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
|