llama_cpp 0.14.3 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +71 -18
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +300 -9333
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +638 -43
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +106 -393
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +133 -93
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +1763 -431
- data/vendor/tmp/llama.cpp/llama.h +67 -19
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
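
The hunks below appear to be excerpted from data/vendor/tmp/llama.cpp/llama.cpp, by far the largest change in this release (+1763 −431). The recurring themes: the CUDA build guard is now GGML_USE_CUDA, GGUF models split across several files can be loaded, Mixtral-style experts are stored as single merged 3-D tensors, and the GROK and XVERSE architectures plus the IQ1_M file type are added. As a rough illustration of how code selects a buffer type behind the renamed guard (a sketch, not code from the gem; the helper name pick_buffer_type and its CPU fallback are assumptions):

```cpp
#include "ggml-backend.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

// Pick a buffer type the way llama_default_buffer_type_offload() does in the diff:
// a CUDA device buffer when that backend is compiled in, host memory otherwise.
static ggml_backend_buffer_type_t pick_buffer_type(int gpu) {
#ifdef GGML_USE_CUDA
    return ggml_backend_cuda_buffer_type(gpu);
#else
    (void) gpu;
    return ggml_backend_cpu_buffer_type();
#endif
}
```
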
@@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

-#ifdef
+#ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #  include "ggml-opencl.h"
@@ -52,12 +52,16 @@
 #define NOMINMAX
 #endif
 #include <windows.h>
+#ifndef PATH_MAX
+#define PATH_MAX MAX_PATH
+#endif
 #include <io.h>
 #endif

 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cctype>
 #include <cfloat>
 #include <cinttypes>
 #include <climits>
@@ -68,7 +72,6 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
@@ -192,6 +195,7 @@ enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
+    LLM_ARCH_GROK,
     LLM_ARCH_GPT2,
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
@@ -214,6 +218,7 @@ enum llm_arch {
     LLM_ARCH_GEMMA,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
+    LLM_ARCH_XVERSE,
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_UNKNOWN,
 };
@@ -221,6 +226,7 @@ enum llm_arch {
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA, "llama" },
     { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GROK, "grok" },
     { LLM_ARCH_GPT2, "gpt2" },
     { LLM_ARCH_GPTJ, "gptj" },
     { LLM_ARCH_GPTNEOX, "gptneox" },
@@ -244,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GEMMA, "gemma" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
+    { LLM_ARCH_XVERSE, "xverse" },
     { LLM_ARCH_COMMAND_R, "command-r" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
@@ -290,6 +297,10 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,

+    LLM_KV_SPLIT_NO,
+    LLM_KV_SPLIT_COUNT,
+    LLM_KV_SPLIT_TENSORS_COUNT,
+
     LLM_KV_SSM_INNER_SIZE,
     LLM_KV_SSM_CONV_KERNEL,
     LLM_KV_SSM_STATE_SIZE,
@@ -355,6 +366,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

+    { LLM_KV_SPLIT_NO, "split.no" },
+    { LLM_KV_SPLIT_COUNT, "split.count" },
+    { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
+
     { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
     { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
     { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
@@ -411,9 +426,12 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
     LLM_TENSOR_FFN_ACT,
-    LLM_TENSOR_FFN_DOWN_EXP,
+    LLM_TENSOR_FFN_DOWN_EXP,  // split experts for backward compatibility
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
+    LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
+    LLM_TENSOR_FFN_GATE_EXPS,
+    LLM_TENSOR_FFN_UP_EXPS,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
@@ -448,6 +466,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+           { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+           { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+           { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
        },
    },
    {
@@ -483,6 +504,31 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
        },
    },
+   {
+       LLM_ARCH_GROK,
+       {
+           { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+           { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+           { LLM_TENSOR_OUTPUT, "output" },
+           { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+           { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+           { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+           { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+           { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+           { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+           { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+           { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+           { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+           { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+           { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+           { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+           { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+           { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+           { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+           { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+           { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+       },
+   },
    {
        LLM_ARCH_GPT2,
        {
@@ -548,6 +594,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
            { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
+           { LLM_TENSOR_POS_EMBD, "position_embd" },
+           { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+           { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
        },
    },
    {
@@ -843,6 +892,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
        },
    },
+   {
+       LLM_ARCH_XVERSE,
+       {
+           { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+           { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+           { LLM_TENSOR_OUTPUT, "output" },
+           { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+           { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+           { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+           { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+           { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+           { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+           { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+           { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+           { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+           { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+           { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+       },
+   },
    {
        LLM_ARCH_COMMAND_R,
        {
@@ -1030,7 +1098,7 @@ struct llama_file {
     size_t size;

     llama_file(const char * fname, const char * mode) {
-        fp =
+        fp = ggml_fopen(fname, mode);
         if (fp == NULL) {
             throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
         }
@@ -1099,6 +1167,7 @@ struct llama_file {
         }
     }
 };
+using llama_files = std::vector<std::unique_ptr<llama_file>>;

 struct llama_mmap {
     void * addr;
@@ -1299,6 +1368,7 @@ struct llama_mmap {
     }
 #endif
 };
+using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;

 // Represents some region of memory being locked using mlock or VirtualLock;
 // will automatically unlock on destruction.
@@ -1448,6 +1518,7 @@ struct llama_mlock {
     static void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
+using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

 static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
@@ -1467,7 +1538,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
     ggml_backend_buffer_type_t buft = nullptr;

-#if defined(
+#if defined(GGML_USE_CUDA)
     // host buffers should only be used when data is expected to be copied to/from the GPU
     if (host_buffer) {
         buft = ggml_backend_cuda_host_buffer_type();
@@ -1497,7 +1568,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {

 #ifdef GGML_USE_METAL
     buft = ggml_backend_metal_buffer_type();
-#elif defined(
+#elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
     buft = ggml_backend_vk_buffer_type(gpu);
@@ -1523,7 +1594,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
     ggml_backend_buffer_type_t buft = nullptr;

-#ifdef
+#ifdef GGML_USE_CUDA
     if (ggml_backend_cuda_get_device_count() > 1) {
         buft = ggml_backend_cuda_split_buffer_type(tensor_split);
     }
@@ -1544,7 +1615,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 }

 static size_t llama_get_device_count() {
-#if defined(
+#if defined(GGML_USE_CUDA)
     return ggml_backend_cuda_get_device_count();
 #elif defined(GGML_USE_SYCL)
     return ggml_backend_sycl_get_device_count();
@@ -1556,7 +1627,7 @@ static size_t llama_get_device_count() {
 }

 static size_t llama_get_device_memory(int device) {
-#if defined(
+#if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
@@ -1621,6 +1692,7 @@ enum e_model {
     MODEL_40B,
     MODEL_65B,
     MODEL_70B,
+    MODEL_314B,
     MODEL_SMALL,
     MODEL_MEDIUM,
     MODEL_LARGE,
@@ -1738,6 +1810,7 @@ struct llama_cparams {
     uint32_t n_ctx; // context size used during inference
     uint32_t n_batch;
     uint32_t n_ubatch;
+    uint32_t n_seq_max;
     uint32_t n_threads; // number of threads to use for generation
     uint32_t n_threads_batch; // number of threads to use for batch processing

@@ -1803,9 +1876,9 @@ struct llama_layer {

     // ff MoE
     struct ggml_tensor * ffn_gate_inp;
-    struct ggml_tensor *
-    struct ggml_tensor *
-    struct ggml_tensor *
+    struct ggml_tensor * ffn_gate_exps;
+    struct ggml_tensor * ffn_down_exps;
+    struct ggml_tensor * ffn_up_exps ;

     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
@@ -2023,12 +2096,12 @@ struct llama_model {
     // the model memory buffers for the tensor data
     std::vector<ggml_backend_buffer_t> bufs;

-    // model memory mapped
-
+    // model memory mapped files
+    llama_mmaps mappings;

     // objects representing data potentially being locked in memory
-
-
+    llama_mlocks mlock_bufs;
+    llama_mlocks mlock_mmaps;

     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -2041,7 +2114,7 @@ struct llama_model {
         ggml_free(ctx);
     }
     for (ggml_backend_buffer_t buf : bufs) {
-#ifdef
+#ifdef GGML_USE_CUDA
         if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
             ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
         }
@@ -2060,10 +2133,6 @@ struct llama_context {
         ggml_backend_free(backend);
     }

-#ifdef GGML_USE_VULKAN
-    ggml_vk_free_cpu_assist();
-#endif
-
     ggml_backend_buffer_free(buf_output);
 }

@@ -2100,20 +2169,20 @@ struct llama_context {
     // host buffer for the model output (logits and embeddings)
     ggml_backend_buffer_t buf_output = nullptr;

-    // decode output (2-dimensional array: [
-    size_t
-    float * logits
+    // decode output (2-dimensional array: [n_outputs][n_vocab])
+    size_t logits_size = 0; // capacity (of floats) for logits
+    float * logits = nullptr;
+
+    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
+    size_t output_size = 0; // capacity (of tokens positions) for the output buffers
+    int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch

-#ifndef NDEBUG
-    // guard against access to unset logits
-    std::vector<bool> logits_valid;
-#endif
     bool logits_all = false;

-    // embeddings output (2-dimensional array: [
+    // embeddings output (2-dimensional array: [n_outputs][n_embd])
     // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
-    size_t
-    float * embd
+    size_t embd_size = 0; // capacity (of floats) for embeddings
+    float * embd = nullptr;

     // sequence embeddings output (map of [n_embd] vectors)
     // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
@@ -2130,14 +2199,15 @@ struct llama_context {
     struct ggml_tensor * inp_tokens;  // I32 [n_batch]
     struct ggml_tensor * inp_embd;    // F32 [n_embd, n_batch]
     struct ggml_tensor * inp_pos;     // I32 [n_batch]
+    struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
     struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
-    struct ggml_tensor * inp_KQ_pos;  // F32 [
+    struct ggml_tensor * inp_KQ_pos;  // F32 [n_kv]
     struct ggml_tensor * inp_K_shift; // I32 [kv_size]
     struct ggml_tensor * inp_mean;    // F32 [n_batch, n_batch]
     struct ggml_tensor * inp_cls;     // I32 [n_batch]
     struct ggml_tensor * inp_s_copy;  // I32 [kv_size]
-    struct ggml_tensor * inp_s_mask;  // F32 [1,
-    struct ggml_tensor * inp_s_seq;   // I32 [
+    struct ggml_tensor * inp_s_mask;  // F32 [1, n_kv]
+    struct ggml_tensor * inp_s_seq;   // I32 [n_kv, n_batch]

     // control vectors
     struct llama_control_vector cvec;
@@ -2792,6 +2862,8 @@ namespace GGUFMeta {
     };
 }

+using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+
 struct llama_model_loader {
     int n_kv      = 0;
     int n_tensors = 0;
@@ -2802,54 +2874,133 @@ struct llama_model_loader {

     bool use_mmap = false;

-
+    llama_files files;
     llama_ftype ftype;
     llama_fver  fver;

-
+    llama_mmaps mappings;
+
+    // Holds information on a model weight
+    struct llama_tensor_weight {
+        uint16_t idx; // source file index
+        size_t offs;  // tensor data offset in the original file
+
+        ggml_tensor * tensor;
+
+        llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+            const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+        }
+    };
+    std::vector<llama_tensor_weight> weights;
+
     std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

-    struct gguf_context *
-
+    struct gguf_context * meta = NULL;
+    std::vector<ggml_context *> contexts;

     std::string arch_name;
     LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

-    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p)
+    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
         int trace = 0;
         if (getenv("LLAMA_TRACE")) {
             trace = atoi(getenv("LLAMA_TRACE"));
         }

-        struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ &ctx_meta,
-        };
-
         if (param_overrides_p != nullptr) {
             for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
                 kv_overrides.insert({std::string(p->key), *p});
             }
         }

-
-
+        struct ggml_context * ctx = NULL;
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx,
+        };
+
+        meta = gguf_init_from_file(fname.c_str(), params);
+        if (!meta) {
             throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
         }

         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));

-
-
+        // Save tensors data offset of the main file.
+        // For subsidiary files, `meta` tensor data offset must not be used,
+        // so we build a unified tensors index for weights.
+        for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+            weights.emplace_back(0, cur->name, meta, cur);
+        }
+        files.emplace_back(new llama_file(fname.c_str(), "rb"));
+        contexts.emplace_back(ctx);
+
+        uint16_t n_split = 0;
+        get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+
+        // Load additional GGML contexts
+        if (n_split > 1) {
+            uint16_t idx = 0;
+            get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
+            if (idx != 0) {
+                throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
+            }
+
+            char split_prefix[PATH_MAX] = {0};
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
+                throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
+            }
+
+            if (trace > 0) {
+                LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
+            }
+
+            char split_path[PATH_MAX] = {0};
+            for (idx = 1; idx < n_split; idx++) {
+                llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+                struct gguf_init_params split_params = {
+                    /*.no_alloc = */ true,
+                    /*.ctx      = */ &ctx,
+                };
+                struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
+                if (!ctx_gguf) {
+                    throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
+                }
+
+                // Save tensors data offset info of the shard.
+                for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+                    weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+                }
+                files.emplace_back(new llama_file(split_path, "rb"));
+                contexts.emplace_back(ctx);
+
+                gguf_free(ctx_gguf);
+            }
+
+            get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+
+            // sanity check
+            {
+                const int n_tensors_loaded = (int) weights.size();
+                if (n_tensors != n_tensors_loaded) {
+                    throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+                }
+            }
+
+            LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+        }
+
+        n_kv      = gguf_get_n_kv(meta);
+        n_tensors = weights.size();

-        fver = (enum llama_fver
+        fver = (enum llama_fver) gguf_get_version(meta);

-        for (
-
-
-            n_elements += ggml_nelements(t);
-            n_bytes    += ggml_nbytes(t);
+        for (auto & w : weights) {
+            n_elements += ggml_nelements(w.tensor);
+            n_bytes    += ggml_nbytes(w.tensor);
         }

         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
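
The constructor above is where split-GGUF support lands: the first file is indexed, split.count is read, every additional shard is opened at a path derived from llama_split_prefix/llama_split_path, and split.tensors.count is used as a sanity check. A small sketch of using those two helpers (called exactly as above; the shard naming convention itself is internal to llama.cpp) to enumerate the shard paths from the first one:

```cpp
#include "llama.h"
#include <cstdio>

// Print the expected path of every shard, given the path of the first split.
static void list_split_paths(const char * first_split, int n_split) {
    char prefix[1024] = {0};
    if (!llama_split_prefix(prefix, sizeof(prefix), first_split, /*split_no=*/0, n_split)) {
        std::fprintf(stderr, "not recognised as the first split: %s\n", first_split);
        return;
    }
    char path[1024] = {0};
    for (int idx = 0; idx < n_split; ++idx) {
        llama_split_path(path, sizeof(path), prefix, idx, n_split);
        std::printf("%s\n", path);
    }
}
```
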
@@ -2864,7 +3015,8 @@ struct llama_model_loader {
         enum ggml_type type_max = GGML_TYPE_F32;

         for (int i = 0; i < n_tensors; i++) {
-
+            const ggml_tensor * tensor = weights.at(i).tensor;
+            enum ggml_type type = tensor->type;

             n_type[type]++;

@@ -2874,8 +3026,8 @@ struct llama_model_loader {
             }

             if (trace > 0) {
-
-                LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(
+                const uint16_t sid = weights.at(i).idx;
+                LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
             }
         }

@@ -2897,6 +3049,7 @@ struct llama_model_loader {
             case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
             case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
             case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+            case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
             case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
             case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
             case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
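
GGML_TYPE_IQ1_M / LLAMA_FTYPE_MOSTLY_IQ1_M is the new 1.75-bpw quantization type recognised above (and named in llama_model_ftype_name further down). A hedged sketch of requesting it through the public quantization API; in practice this type also needs an importance matrix, which is omitted here, and the file paths are placeholders:

```cpp
#include "llama.h"

// Re-quantize a GGUF file to the new IQ1_M type.
static int quantize_to_iq1_m(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_IQ1_M;
    return (int) llama_model_quantize(fname_inp, fname_out, &params);
}
```
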
@@ -2911,22 +3064,23 @@ struct llama_model_loader {
         ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

         {
-            const int kid = gguf_find_key(
+            const int kid = gguf_find_key(meta, "general.file_type");
             if (kid >= 0) {
-                ftype = (llama_ftype) gguf_get_val_u32(
+                ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
             }
         }

         LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+
         for (int i = 0; i < n_kv; i++) {
-            const char * name = gguf_get_key(
-            const enum gguf_type type = gguf_get_kv_type(
+            const char * name = gguf_get_key(meta, i);
+            const enum gguf_type type = gguf_get_kv_type(meta, i);
             const std::string type_name =
                 type == GGUF_TYPE_ARRAY
-                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
                 : gguf_type_name(type);

-            std::string value = gguf_kv_to_str(
+            std::string value = gguf_kv_to_str(meta, i);
             const size_t MAX_VALUE_LEN = 40;
             if (value.size() > MAX_VALUE_LEN) {
                 value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -2955,18 +3109,18 @@ struct llama_model_loader {
     }

     ~llama_model_loader() {
-        if (
-            gguf_free(
+        if (meta) {
+            gguf_free(meta);
         }
-
-        ggml_free(
+        for (auto * ctx : contexts) {
+            ggml_free(ctx);
         }
     }

     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type
     get_arr_n(const std::string & key, T & result, const bool required = true) {
-        const int kid = gguf_find_key(
+        const int kid = gguf_find_key(meta, key.c_str());

         if (kid < 0) {
             if (required) {
@@ -2976,7 +3130,7 @@ struct llama_model_loader {
         }

         struct GGUFMeta::ArrayInfo arr_info =
-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);


         result = arr_info.length;
@@ -2996,7 +3150,7 @@ struct llama_model_loader {
         const struct llama_model_kv_override * override =
             it != kv_overrides.end() ? &it->second : nullptr;

-        const bool found = GGUFMeta::GKV<T>::set(
+        const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);

         if (required && !found) {
             throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -3019,28 +3173,57 @@ struct llama_model_loader {
     }

     const char * get_tensor_name(int i) const {
-        return
+        return weights.at(i).tensor->name;
+    }
+
+    const llama_tensor_weight * get_weight(const char * name) const {
+        for (const auto & weight : weights) {
+            if (strcmp(name, weight.tensor->name) == 0) {
+                return &weight;
+            }
+        }
+        return nullptr;
+    }
+
+    const llama_tensor_weight & require_weight(const char * name) const {
+        const llama_tensor_weight * weight = get_weight(name);
+        if (!weight) {
+            throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+        }
+        return *weight;
     }

     struct ggml_tensor * get_tensor_meta(const char * name) const {
-
+        const auto * weight = get_weight(name);
+        if (!weight) {
+            return nullptr;
+        }
+        return weight->tensor;
+    }
+
+    struct ggml_tensor * require_tensor_meta(const char * name) const {
+        struct ggml_tensor * tensor = get_tensor_meta(name);
+        if (!tensor) {
+            throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+        }
+        return tensor;
     }

     struct ggml_tensor * get_tensor_meta(int i) const {
         return get_tensor_meta(get_tensor_name(i));
     }

-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor *
-        struct ggml_tensor * tensor = ggml_dup_tensor(ctx,
-        ggml_set_name(tensor, ggml_get_name(
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+        struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
+        ggml_set_name(tensor, ggml_get_name(cur));

         n_created++;

         return tensor;
     }

-    struct ggml_tensor *
-    struct ggml_tensor * cur =
+    const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
+        const struct ggml_tensor * cur = get_tensor_meta(name.c_str());

         if (cur == NULL) {
             if (!required) {
@@ -3051,8 +3234,8 @@ struct llama_model_loader {

         {
             bool is_ok = true;
-            for (size_t i = 0; i <
-                if (ne[i] != cur->ne[i]) {
+            for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+                if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
                     is_ok = false;
                     break;
                 }
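
The dimension check above now accepts an `ne` vector shorter than GGML_MAX_DIMS (4) and insists that any unlisted trailing dimension equals 1, which is what lets 2-D and 3-D shape specifications coexist for the expert tensors. Restated as a standalone predicate (the sizes in the usage comment are illustrative):

```cpp
#include <cstdint>
#include <vector>

// True when every requested dimension matches and all remaining dimensions are 1.
static bool dims_match(const int64_t (&actual)[4], const std::vector<int64_t> & ne) {
    for (size_t i = 0; i < 4; ++i) {
        if (i < ne.size() ? actual[i] != ne[i] : actual[i] != 1) {
            return false;
        }
    }
    return true;
}
// dims_match({4096, 14336, 8, 1}, {4096, 14336, 8}) -> true  (merged-experts tensor)
// dims_match({4096, 14336, 8, 1}, {4096, 14336})    -> false (unexpected expert dimension)
```
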
@@ -3066,127 +3249,196 @@ struct llama_model_loader {
             }
         }

-        return
+        return cur;
     }

-
-
-
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
+        const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+
+        if (cur == NULL) {
+            return NULL;
         }
+
+        return create_tensor_for(ctx, cur);
     }

-
-    const
+    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
+        const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);

-        if (
-
+        if (cur == NULL) {
+            return NULL;
         }

-
-
+        if (cur->type != base->type) {
+            throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
+        }

-
-
-
-            mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+        std::array<int64_t, GGML_MAX_DIMS> dims;
+        for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+            dims[i] = i < ne.size() ? ne[i] : 1;
         }

-
-
-
-
+        struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
+                                        dims[0], dims[1], dims[2], dims[3],
+                                        cur->nb[1], cur->nb[2], cur->nb[3],
+                                        offset);
+
+        ggml_set_name(tensor, name.c_str());
+
+        n_created++;
+
+        return tensor;
+    }
+
+    void done_getting_tensors() const {
+        if (n_created != n_tensors) {
+            throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
         }
+    }

-
-
-
+    void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
+        if (use_mmap) {
+            mappings.reserve(files.size());
+            mmaps_used.reserve(files.size());
+            for (const auto & file : files) {
+                std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+                mmaps_used.emplace_back(mapping->size, 0);
+                if (mlock_mmaps) {
+                    std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
+                    mlock_mmap->init(mapping->addr);
+                    mlock_mmaps->emplace_back(std::move(mlock_mmap));
+                }
+                mappings.emplace_back(std::move(mapping));
             }
-
+        }
+
+        // compute the total size of all tensors for progress reporting
+        for (auto & w : weights) {
+            size_data += ggml_nbytes(w.tensor);
         }
     }

-    void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
-        GGML_ASSERT(
+    void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+        GGML_ASSERT(!mappings.empty());
+        const auto & mapping = mappings.at(idx);

         *first = mapping->size;
         *last  = 0;
+        *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-
-
-
+            try {
+                const auto * weight = get_weight(ggml_get_name(tensor));
+                if (!weight) {
+                    continue;
+                }
+                if (weight->idx != idx) {
+                    continue;
+                }
+                *first = std::min(*first, weight->offs);
+                *last  = std::max(*last,  weight->offs + ggml_nbytes(tensor));
+            } catch(...) {
+                // the tensor is not in the model
+            }
         }
     }

     // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const {
-        const
+        const auto & w = require_weight(ggml_get_name(cur));

-        if (use_mmap
+        if (use_mmap) {
+            const auto & mapping = mappings.at(w.idx);
             if (cur->data == nullptr) {
-                cur->data = (uint8_t *)mapping->addr + offs;
+                cur->data = (uint8_t *)mapping->addr + w.offs;
             } else {
-                memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
+                memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
             }
         } else {
             GGML_ASSERT(cur->data != nullptr);
-
-            file.
+            GGML_ASSERT(w.idx < files.size());
+            const auto & file = files.at(w.idx);
+            file->seek(w.offs, SEEK_SET);
+            file->read_raw(cur->data, ggml_nbytes(cur));
         }
     }

     size_t size_done = 0;
     size_t size_data = 0;
-    size_t
-    size_t mmap_used_last  = 0;
+    std::vector<std::pair<size_t, size_t>> mmaps_used;

     // Returns false if cancelled by progress_callback
-    bool load_all_data(
-
+    bool load_all_data(
+            struct ggml_context * ctx,
+            llama_buf_map & bufs_mmap,
+            llama_mlocks * lmlocks,
+            llama_progress_callback progress_callback,
+            void * progress_callback_user_data) {
+        GGML_ASSERT(size_data != 0 && "call init_mappings() first");

         std::vector<no_init<uint8_t>> read_buf;
-
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+            const auto * weight = get_weight(ggml_get_name(cur));
+            if (weight == nullptr) {
+                // this can happen with split experts models
+                continue;
+            }
+
             if (progress_callback) {
                 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                     return false;
                 }
             }

-
+            size_t n_size = ggml_nbytes(cur);

-            if (use_mmap
+            if (use_mmap) {
+                const auto & mapping = mappings.at(weight->idx);
+                ggml_backend_buffer_t buf_mmap = nullptr;
+                if (bufs_mmap.count(weight->idx)) {
+                    buf_mmap = bufs_mmap.at(weight->idx);
+                }
+                GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
-                    ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
-                    if (
-                        lmlock
+                    ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
+                    if (lmlocks) {
+                        const auto & lmlock = lmlocks->at(weight->idx);
+                        lmlock->grow_to(weight->offs + ggml_nbytes(cur));
                     }
-
-
+
+                    auto & mmap_used = mmaps_used[weight->idx];
+                    mmap_used.first  = std::min(mmap_used.first,  weight->offs);
+                    mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                 } else {
-                    ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0,
+                    ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
                 }
             } else {
+                GGML_ASSERT(weight->idx < files.size());
+                const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
-                    file
-                    file
+                    file->seek(weight->offs, SEEK_SET);
+                    file->read_raw(cur->data, ggml_nbytes(cur));
                 } else {
                     read_buf.resize(ggml_nbytes(cur));
-                    file
-                    file
-                    ggml_backend_tensor_set(cur, read_buf.data(), 0,
+                    file->seek(weight->offs, SEEK_SET);
+                    file->read_raw(read_buf.data(), ggml_nbytes(cur));
+                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                 }
             }

-            size_done +=
+            size_done += n_size;
         }

         // check if this is the last call and do final cleanup
         if (size_done >= size_data) {
             // unmap offloaded tensors and metadata
-            if (use_mmap
-
-
-                mapping
+            if (use_mmap) {
+                for (uint32_t idx = 0; idx < mappings.size(); idx++) {
+                    const auto & mmap_used = mmaps_used.at(idx);
+                    auto & mapping = mappings.at(idx);
+                    mapping->unmap_fragment(0, mmap_used.first);
+                    if (mmap_used.second != 0) {
+                        mapping->unmap_fragment(mmap_used.second, mapping->size);
+                    }
                 }
             }
             if (progress_callback) {
@@ -3259,6 +3511,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
@@ -3290,6 +3543,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_40B: return "40B";
         case MODEL_65B: return "65B";
         case MODEL_70B: return "70B";
+        case MODEL_314B: return "314B";
         case MODEL_SMALL: return "0.1B";
         case MODEL_MEDIUM: return "0.4B";
         case MODEL_LARGE: return "0.8B";
@@ -3319,7 +3573,7 @@ static void llm_load_hparams(
         llama_model_loader & ml,
         llama_model & model) {
     auto & hparams = model.hparams;
-    const gguf_context * ctx = ml.
+    const gguf_context * ctx = ml.meta;

     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3428,6 +3682,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GROK:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 64: model.type = e_model::MODEL_314B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_FALCON:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3679,6 +3942,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_XVERSE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    case 80: model.type = e_model::MODEL_65B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_COMMAND_R:
             {
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -3709,7 +3982,7 @@ static void llm_load_vocab(
         llama_model & model) {
     auto & vocab = model.vocab;

-    struct gguf_context * ctx = ml.
+    struct gguf_context * ctx = ml.meta;

     const auto kv = LLM_KV(model.arch);

@@ -3842,7 +4115,7 @@ static void llm_load_vocab(
     } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
         vocab.linefeed_id = vocab.special_pad_id;
     } else {
-        const std::vector<int> ids = llama_tokenize_internal(vocab, "\
+        const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
         GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
         vocab.linefeed_id = ids[0];
     }
@@ -4075,6 +4348,7 @@ static bool llm_load_tensors(

     const int64_t n_layer     = hparams.n_layer;
     const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+    bool use_mmap_buffer = true;

     // there is very little benefit to offloading the input layer, so always keep it on the CPU
     model.buft_input = llama_default_buffer_type_cpu(true);
@@ -4163,6 +4437,10 @@ static bool llm_load_tensors(

     // create one context per buffer type
     size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
+
+    // for moe merged tensors
+    ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
+
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
         struct ggml_init_params params = {
@@ -4189,6 +4467,11 @@ static bool llm_load_tensors(
     const int64_t n_vocab      = hparams.n_vocab;
     const int64_t n_vocab_type = hparams.n_vocab_type;
     const int64_t n_ff         = hparams.n_ff;
+    const int64_t n_expert     = hparams.n_expert;
+
+    if (n_expert > 0 && hparams.n_expert_used == 0) {
+        throw std::runtime_error("model has expert layers but no expert layers are used");
+    }

     GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

@@ -4243,26 +4526,113 @@ static bool llm_load_tensors(

                     layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

-
-
-                    if (layer.ffn_gate_inp == nullptr) {
-                        GGML_ASSERT(hparams.n_expert == 0);
-                        GGML_ASSERT(hparams.n_expert_used == 0);
-
+                    if (n_expert == 0) {
                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                     } else {
-
-
-
-
-
-                        layer.
-
-
+                        layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                        if (layer.ffn_gate_exps) {
+                            layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+                            layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+                        } else {
+                            // merge split expert into a single tensor for compatibility with older models
+                            // requires disabling mmap
+                            use_mmap_buffer = false;
+
+                            ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
+                            ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
+                            ggml_type type_up   = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
+
+                            layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
+                            layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
+                            layer.ffn_up_exps   = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
+
+                            ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
+                            ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
+                            ggml_set_name(layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
+
+                            for (uint32_t x = 0; x < n_expert; ++x) {
+                                // the individual experts are loaded into a view of the merged tensor
+                                ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
+                                ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
+                                ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
+                            }
+                        }
+                    }
+                }
+            } break;
+        case LLM_ARCH_GROK:
+            {
+                if (n_expert == 0) {
+                    throw std::runtime_error("Grok model cannot have zero experts");
+                }
+
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
+                    }
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                    layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                    if (layer.ffn_gate_exps) {
+                        layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+                        layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+                    } else {
+                        // merge split expert into a single tensor for compatibility with older models
+                        // requires disabling mmap
+                        use_mmap_buffer = false;
+
+                        ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
+                        ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
+                        ggml_type type_up   = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
+
+                        layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
+                        layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
+                        layer.ffn_up_exps   = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
+
+                        ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
+                        ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
+                        ggml_set_name(layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
+
+                        for (uint32_t x = 0; x < n_expert; ++x) {
+                            // the individual experts are loaded into a view of the merged tensor
+                            ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
+                            ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
+                            ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
                         }
                     }
+
+                    layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
                 }
             } break;
         case LLM_ARCH_BAICHUAN:
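
When a model still ships one 2-D tensor per expert (blk.N.ffn_up.X and friends), the code above allocates a single 3-D [n_embd, n_ff, n_expert] tensor per projection and loads expert x into the slice starting at byte offset nb[2]*x, at the cost of disabling mmap for that model. The slice itself is just a strided ggml view; a minimal sketch (the sizes in the usage comment are illustrative):

```cpp
#include "ggml.h"

// 2-D view of expert `x` inside a merged [n_embd, n_ff, n_expert] tensor,
// laid out exactly like one of the legacy per-expert tensors.
static struct ggml_tensor * expert_slice(struct ggml_context * ctx, struct ggml_tensor * merged, int x) {
    return ggml_view_2d(ctx, merged,
            merged->ne[0], merged->ne[1],  // n_embd, n_ff
            merged->nb[1],                 // row stride within the slice
            merged->nb[2]*(size_t) x);     // byte offset of expert x
}
// usage:
//   struct ggml_tensor * up_exps = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 4096, 14336, 8);
//   struct ggml_tensor * up_0    = expert_slice(ctx, up_exps, 0);
```
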
@@ -4319,10 +4689,8 @@ static bool llm_load_tensors(
                     layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                     layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-
-
-                        layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
-                    }
+                    layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
+                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);

                     layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
                     layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -4502,6 +4870,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_MPT:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);

                 // output
                 {
@@ -4540,6 +4909,12 @@ static bool llm_load_tensors(
                     layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                     layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);

+                    layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+
+                    layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+
                     // AWQ ScaleActivation layer
                     layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
                 }
@@ -4986,6 +5361,28 @@ static bool llm_load_tensors(
                     layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
                 }
             } break;
+        case LLM_ARCH_XVERSE:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+                    auto & layer = model.layers[i];
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                }
+            } break;
         case LLM_ARCH_COMMAND_R:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5024,56 +5421,97 @@ static bool llm_load_tensors(
|
|
5024
5421
|
|
5025
5422
|
ml.done_getting_tensors();
|
5026
5423
|
|
5027
|
-
ml.
|
5424
|
+
ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
|
5425
|
+
model.mappings.reserve(ml.mappings.size());
|
5028
5426
|
|
5029
5427
|
// create the backend buffers
|
5030
|
-
std::vector<std::pair<ggml_context *,
|
5428
|
+
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
|
5429
|
+
ctx_bufs.reserve(ctx_map.size());
|
5430
|
+
|
5431
|
+
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
|
5432
|
+
size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
|
5433
|
+
model.bufs.reserve(n_max_backend_buffer);
|
5031
5434
|
|
5032
5435
|
for (auto & it : ctx_map) {
|
5033
5436
|
ggml_backend_buffer_type_t buft = it.first;
|
5034
|
-
ggml_context * ctx
|
5035
|
-
|
5437
|
+
ggml_context * ctx = it.second;
|
5438
|
+
|
5439
|
+
llama_buf_map bufs;
|
5440
|
+
bufs.reserve(n_max_backend_buffer);
|
5036
5441
|
|
5037
5442
|
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
5038
5443
|
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
5039
5444
|
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
5040
|
-
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
|
5041
|
-
|
5042
|
-
|
5043
|
-
|
5044
|
-
|
5045
|
-
|
5046
|
-
|
5445
|
+
if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
|
5446
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5447
|
+
void * addr = nullptr;
|
5448
|
+
size_t first, last;
|
5449
|
+
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
5450
|
+
if (first >= last) {
|
5451
|
+
continue;
|
5452
|
+
}
|
5453
|
+
ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
|
5454
|
+
if (buf == nullptr) {
|
5455
|
+
throw std::runtime_error("unable to allocate backend CPU buffer");
|
5456
|
+
}
|
5457
|
+
model.bufs.push_back(buf);
|
5458
|
+
bufs.emplace(idx, buf);
|
5459
|
+
#ifdef GGML_USE_CUDA
|
5460
|
+
if (n_layer >= n_gpu_layers) {
|
5461
|
+
ggml_backend_cuda_register_host_buffer(
|
5047
5462
|
ggml_backend_buffer_get_base(buf),
|
5048
5463
|
ggml_backend_buffer_get_size(buf));
|
5049
|
-
|
5464
|
+
}
|
5050
5465
|
#endif
|
5466
|
+
}
|
5051
5467
|
}
|
5052
5468
|
#ifdef GGML_USE_METAL
|
5053
|
-
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
|
5054
|
-
|
5055
|
-
|
5056
|
-
|
5057
|
-
|
5469
|
+
else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
|
5470
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5471
|
+
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
5472
|
+
void * addr = nullptr;
|
5473
|
+
size_t first, last;
|
5474
|
+
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
5475
|
+
if (first >= last) {
|
5476
|
+
continue;
|
5477
|
+
}
|
5478
|
+
ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
|
5479
|
+
if (buf == nullptr) {
|
5480
|
+
throw std::runtime_error("unable to allocate backend metal buffer");
|
5481
|
+
}
|
5482
|
+
model.bufs.push_back(buf);
|
5483
|
+
bufs.emplace(idx, buf);
|
5484
|
+
}
|
5058
5485
|
}
|
5059
5486
|
#endif
|
5060
5487
|
else {
|
5061
|
-
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
5062
|
-
if (buf
|
5488
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
5489
|
+
if (buf == nullptr) {
|
5490
|
+
throw std::runtime_error("unable to allocate backend buffer");
|
5491
|
+
}
|
5492
|
+
model.bufs.push_back(buf);
|
5493
|
+
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
|
5063
5494
|
model.mlock_bufs.emplace_back(new llama_mlock);
|
5064
5495
|
auto & mlock_buf = model.mlock_bufs.back();
|
5065
5496
|
mlock_buf->init (ggml_backend_buffer_get_base(buf));
|
5066
5497
|
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
|
5067
5498
|
}
|
5499
|
+
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
5500
|
+
bufs.emplace(idx, buf);
|
5501
|
+
}
|
5068
5502
|
}
|
5069
|
-
|
5503
|
+
|
5504
|
+
if (bufs.empty()) {
|
5070
5505
|
throw std::runtime_error("failed to allocate buffer");
|
5071
5506
|
}
|
5072
|
-
|
5073
|
-
|
5074
|
-
|
5075
|
-
|
5076
|
-
|
5507
|
+
|
5508
|
+
for (auto & buf : bufs) {
|
5509
|
+
// indicate that this buffer contains weights
|
5510
|
+
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
|
5511
|
+
ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
5512
|
+
}
|
5513
|
+
|
5514
|
+
ctx_bufs.emplace_back(ctx, bufs);
|
5077
5515
|
}
|
5078
5516
|
|
5079
5517
|
if (llama_supports_gpu_offload()) {
|
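Editor's note: the hunk above reworks buffer creation so that, when mmap is enabled, the mapped region of each model file that actually holds tensors is wrapped directly as a backend buffer (one per file, hence the ctx_map.size() * ml.files.size() capacity reservation) and flagged as a weights buffer for the scheduler, instead of allocating fresh memory and copying. Below is a minimal sketch of that idea, not the loader itself: it assumes a POSIX system plus only the public ggml-backend calls visible in the hunk (ggml_backend_cpu_buffer_from_ptr, ggml_backend_buffer_set_usage); the path handling and region bounds are illustrative stand-ins for llama_mmap and get_mapping_range.

// Sketch only: wrap an mmap'ed file region as a CPU backend buffer so tensor
// data is used in place instead of being copied. Error handling is trimmed.
#include "ggml-backend.h"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#include <stdexcept>

static ggml_backend_buffer_t weights_buffer_from_file(const char * path, size_t first, size_t last) {
    int fd = open(path, O_RDONLY);
    if (fd < 0) { throw std::runtime_error("open failed"); }

    struct stat st;
    fstat(fd, &st);

    void * addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    close(fd); // the mapping stays valid after the descriptor is closed
    if (addr == MAP_FAILED) { throw std::runtime_error("mmap failed"); }

    // only the [first, last) region that actually contains tensors is wrapped
    ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
    if (buf == NULL) { throw std::runtime_error("unable to wrap mapping"); }

    // hint for ggml_backend_sched: this buffer holds weights
    ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
    return buf;
}
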
@@ -5105,13 +5543,17 @@ static bool llm_load_tensors(
|
|
5105
5543
|
// load tensor data
|
5106
5544
|
for (auto & it : ctx_bufs) {
|
5107
5545
|
ggml_context * ctx = it.first;
|
5108
|
-
|
5109
|
-
if (!ml.load_all_data(ctx,
|
5546
|
+
auto & bufs = it.second;
|
5547
|
+
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
5110
5548
|
return false;
|
5111
5549
|
}
|
5112
5550
|
}
|
5113
5551
|
|
5114
|
-
|
5552
|
+
if (use_mmap_buffer) {
|
5553
|
+
for (auto & mapping : ml.mappings) {
|
5554
|
+
model.mappings.emplace_back(std::move(mapping));
|
5555
|
+
}
|
5556
|
+
}
|
5115
5557
|
|
5116
5558
|
// loading time will be recalculate after the first eval, so
|
5117
5559
|
// we take page faults deferred by mmap() into consideration
|
@@ -5266,8 +5708,8 @@ static void llm_build_kv_store(
|
|
5266
5708
|
GGML_ASSERT(kv.size == n_ctx);
|
5267
5709
|
|
5268
5710
|
// compute the transposed [n_tokens, n_embd] V matrix
|
5269
|
-
|
5270
|
-
|
5711
|
+
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
|
5712
|
+
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
|
5271
5713
|
cb(v_cur_t, "v_cur_t", il);
|
5272
5714
|
|
5273
5715
|
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
|
@@ -5451,6 +5893,20 @@ static struct ggml_tensor * llm_build_kqv(
|
|
5451
5893
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
5452
5894
|
}
|
5453
5895
|
|
5896
|
+
if (model.arch == LLM_ARCH_GROK) {
|
5897
|
+
// need to do the following:
|
5898
|
+
// multiply by attn_output_multiplier of 0.08838834764831845
|
5899
|
+
// and then:
|
5900
|
+
// kq = 30 * tanh(kq / 30)
|
5901
|
+
// before the softmax below
|
5902
|
+
|
5903
|
+
//try from phi2
|
5904
|
+
//ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
5905
|
+
|
5906
|
+
kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
|
5907
|
+
kq = ggml_scale(ctx, kq, 30);
|
5908
|
+
}
|
5909
|
+
|
5454
5910
|
#if defined(GGML_USE_KOMPUTE)
|
5455
5911
|
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
|
5456
5912
|
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
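Editor's note: the Grok branch added above soft-caps the attention scores before the softmax: the scores are multiplied by the 0.08838834764831845 attention multiplier (folded into a single scale of attn_mult/30), squashed with tanh, and scaled back up by 30 so every value saturates in (-30, 30). A scalar illustration of that cap, independent of ggml and with made-up input scores:

// out = 30 * tanh(score * attn_mult / 30): large magnitudes saturate near ±30.
#include <cmath>
#include <cstdio>

static float grok_soft_cap(float score) {
    const float attn_mult = 0.08838834764831845f; // attention output multiplier
    const float cap       = 30.0f;                // saturation bound
    return cap * tanhf(score * attn_mult / cap);
}

int main() {
    const float scores[] = { -500.0f, -10.0f, 0.0f, 10.0f, 500.0f };
    for (float s : scores) {
        printf("score % 8.1f -> capped % 8.4f\n", s, grok_soft_cap(s));
    }
    return 0;
}
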
@@ -5577,7 +6033,8 @@ struct llm_build_context {
|
|
5577
6033
|
const float norm_rms_eps;
|
5578
6034
|
|
5579
6035
|
const int32_t n_tokens;
|
5580
|
-
const int32_t n_kv; // size of KV cache to consider (n_kv <=
|
6036
|
+
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
|
6037
|
+
const int32_t n_outputs;
|
5581
6038
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
5582
6039
|
const int32_t n_orig_ctx;
|
5583
6040
|
|
@@ -5624,6 +6081,7 @@ struct llm_build_context {
|
|
5624
6081
|
norm_rms_eps (hparams.f_norm_rms_eps),
|
5625
6082
|
n_tokens (batch.n_tokens),
|
5626
6083
|
n_kv (worst_case ? kv_self.size : kv_self.n),
|
6084
|
+
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
|
5627
6085
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
5628
6086
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
5629
6087
|
pooling_type (cparams.pooling_type),
|
@@ -5645,6 +6103,7 @@ struct llm_build_context {
|
|
5645
6103
|
lctx.inp_tokens = nullptr;
|
5646
6104
|
lctx.inp_embd = nullptr;
|
5647
6105
|
lctx.inp_pos = nullptr;
|
6106
|
+
lctx.inp_out_ids = nullptr;
|
5648
6107
|
lctx.inp_KQ_mask = nullptr;
|
5649
6108
|
lctx.inp_KQ_pos = nullptr;
|
5650
6109
|
lctx.inp_K_shift = nullptr;
|
@@ -5768,6 +6227,13 @@ struct llm_build_context {
|
|
5768
6227
|
return lctx.inp_pos;
|
5769
6228
|
}
|
5770
6229
|
|
6230
|
+
struct ggml_tensor * build_inp_out_ids() {
|
6231
|
+
lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
|
6232
|
+
cb(lctx.inp_out_ids, "inp_out_ids", -1);
|
6233
|
+
ggml_set_input(lctx.inp_out_ids);
|
6234
|
+
return lctx.inp_out_ids;
|
6235
|
+
}
|
6236
|
+
|
5771
6237
|
struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
|
5772
6238
|
if (causal) {
|
5773
6239
|
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
|
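Editor's note: build_inp_out_ids() above creates an I32 tensor holding the indices of the tokens whose output is actually needed; in the last layer each graph then keeps only those rows via ggml_get_rows, which is what the repeated "skip computing output for unused tokens" blocks below do. A standalone sketch of the row-gather step, assuming the bundled ggml CPU API; shapes and index values are purely illustrative.

// Gather only the rows (tokens) listed in an I32 index tensor, as the last
// layer of each build_* graph now does with lctx.inp_out_ids.
#include "ggml.h"

#include <cstdint>
#include <cstdio>

int main() {
    struct ggml_init_params params = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd = 4, n_tokens = 5;

    // "hidden states": n_tokens rows of n_embd values (ggml dim 0 is the row)
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    for (int t = 0; t < n_tokens; ++t) {
        for (int e = 0; e < n_embd; ++e) {
            ((float *) cur->data)[t*n_embd + e] = (float) t;
        }
    }

    // keep only tokens 1 and 4 (e.g. the ones with batch.logits[i] != 0)
    struct ggml_tensor * out_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
    ((int32_t *) out_ids->data)[0] = 1;
    ((int32_t *) out_ids->data)[1] = 4;

    struct ggml_tensor * picked = ggml_get_rows(ctx, cur, out_ids);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, picked);
    ggml_graph_compute_with_ctx(ctx, gf, 1);

    printf("picked rows: %lld x %lld\n", (long long) picked->ne[0], (long long) picked->ne[1]);
    ggml_free(ctx);
    return 0;
}
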
@@ -5824,6 +6290,9 @@ struct llm_build_context {
|
|
5824
6290
|
struct ggml_cgraph * build_llama() {
|
5825
6291
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5826
6292
|
|
6293
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
6294
|
+
int32_t n_tokens = this->n_tokens;
|
6295
|
+
|
5827
6296
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5828
6297
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
5829
6298
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
@@ -5891,6 +6360,14 @@ struct llm_build_context {
|
|
5891
6360
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5892
6361
|
}
|
5893
6362
|
|
6363
|
+
if (il == n_layer - 1) {
|
6364
|
+
// skip computing output for unused tokens
|
6365
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6366
|
+
n_tokens = n_outputs;
|
6367
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6368
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6369
|
+
}
|
6370
|
+
|
5894
6371
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
5895
6372
|
cb(ffn_inp, "ffn_inp", il);
|
5896
6373
|
|
@@ -5943,19 +6420,19 @@ struct llm_build_context {
|
|
5943
6420
|
for (int i = 0; i < n_expert_used; ++i) {
|
5944
6421
|
ggml_tensor * cur_expert;
|
5945
6422
|
|
5946
|
-
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].
|
6423
|
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
5947
6424
|
cb(cur_up, "ffn_moe_up", il);
|
5948
6425
|
|
5949
|
-
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].
|
6426
|
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
5950
6427
|
cb(cur_gate, "ffn_moe_gate", il);
|
5951
6428
|
|
5952
6429
|
cur_gate = ggml_silu(ctx0, cur_gate);
|
5953
6430
|
cb(cur_gate, "ffn_moe_silu", il);
|
5954
6431
|
|
5955
|
-
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6432
|
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
5956
6433
|
cb(cur_expert, "ffn_moe_gate_par", il);
|
5957
6434
|
|
5958
|
-
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].
|
6435
|
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
5959
6436
|
cb(cur_expert, "ffn_moe_down", il);
|
5960
6437
|
|
5961
6438
|
cur_expert = ggml_mul(ctx0, cur_expert,
|
@@ -6070,6 +6547,13 @@ struct llm_build_context {
|
|
6070
6547
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6071
6548
|
}
|
6072
6549
|
|
6550
|
+
if (il == n_layer - 1) {
|
6551
|
+
// skip computing output for unused tokens
|
6552
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6553
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6554
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6555
|
+
}
|
6556
|
+
|
6073
6557
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6074
6558
|
cb(ffn_inp, "ffn_inp", il);
|
6075
6559
|
|
@@ -6112,6 +6596,111 @@ struct llm_build_context {
|
|
6112
6596
|
return gf;
|
6113
6597
|
}
|
6114
6598
|
|
6599
|
+
struct ggml_cgraph * build_xverse() {
|
6600
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6601
|
+
|
6602
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6603
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6604
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6605
|
+
|
6606
|
+
struct ggml_tensor * cur;
|
6607
|
+
struct ggml_tensor * inpL;
|
6608
|
+
|
6609
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6610
|
+
|
6611
|
+
// inp_pos - contains the positions
|
6612
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6613
|
+
|
6614
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6615
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6616
|
+
|
6617
|
+
// positions of the tokens in the KV cache
|
6618
|
+
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
6619
|
+
|
6620
|
+
for (int il = 0; il < n_layer; ++il) {
|
6621
|
+
struct ggml_tensor * inpSA = inpL;
|
6622
|
+
|
6623
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6624
|
+
model.layers[il].attn_norm, NULL,
|
6625
|
+
LLM_NORM_RMS, cb, il);
|
6626
|
+
cb(cur, "attn_norm", il);
|
6627
|
+
|
6628
|
+
// self-attention
|
6629
|
+
{
|
6630
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6631
|
+
cb(Qcur, "Qcur", il);
|
6632
|
+
|
6633
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6634
|
+
cb(Kcur, "Kcur", il);
|
6635
|
+
|
6636
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6637
|
+
cb(Vcur, "Vcur", il);
|
6638
|
+
|
6639
|
+
Qcur = ggml_rope_custom(
|
6640
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6641
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6642
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6643
|
+
);
|
6644
|
+
cb(Qcur, "Qcur", il);
|
6645
|
+
|
6646
|
+
Kcur = ggml_rope_custom(
|
6647
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6648
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6649
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6650
|
+
);
|
6651
|
+
cb(Kcur, "Kcur", il);
|
6652
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6653
|
+
model.layers[il].wo, NULL,
|
6654
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6655
|
+
}
|
6656
|
+
|
6657
|
+
if (il == n_layer - 1) {
|
6658
|
+
// skip computing output for unused tokens
|
6659
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6660
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6661
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6662
|
+
}
|
6663
|
+
|
6664
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6665
|
+
cb(ffn_inp, "ffn_inp", il);
|
6666
|
+
|
6667
|
+
// feed-forward network
|
6668
|
+
{
|
6669
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6670
|
+
model.layers[il].ffn_norm, NULL,
|
6671
|
+
LLM_NORM_RMS, cb, il);
|
6672
|
+
cb(cur, "ffn_norm", il);
|
6673
|
+
|
6674
|
+
cur = llm_build_ffn(ctx0, cur,
|
6675
|
+
model.layers[il].ffn_up, NULL,
|
6676
|
+
model.layers[il].ffn_gate, NULL,
|
6677
|
+
model.layers[il].ffn_down, NULL,
|
6678
|
+
NULL,
|
6679
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
6680
|
+
cb(cur, "ffn_out", il);
|
6681
|
+
}
|
6682
|
+
|
6683
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6684
|
+
cb(cur, "l_out", il);
|
6685
|
+
|
6686
|
+
// input for next layer
|
6687
|
+
inpL = cur;
|
6688
|
+
}
|
6689
|
+
|
6690
|
+
cur = inpL;
|
6691
|
+
|
6692
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
|
6693
|
+
cb(cur, "result_norm", -1);
|
6694
|
+
|
6695
|
+
// lm_head
|
6696
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
6697
|
+
cb(cur, "result_output", -1);
|
6698
|
+
|
6699
|
+
ggml_build_forward_expand(gf, cur);
|
6700
|
+
|
6701
|
+
return gf;
|
6702
|
+
}
|
6703
|
+
|
6115
6704
|
struct ggml_cgraph * build_falcon() {
|
6116
6705
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6117
6706
|
|
@@ -6185,6 +6774,14 @@ struct llm_build_context {
|
|
6185
6774
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6186
6775
|
}
|
6187
6776
|
|
6777
|
+
if (il == n_layer - 1) {
|
6778
|
+
// skip computing output for unused tokens
|
6779
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6780
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6781
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
6782
|
+
attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
|
6783
|
+
}
|
6784
|
+
|
6188
6785
|
struct ggml_tensor * ffn_inp = cur;
|
6189
6786
|
|
6190
6787
|
// feed forward
|
@@ -6225,6 +6822,214 @@ struct llm_build_context {
|
|
6225
6822
|
return gf;
|
6226
6823
|
}
|
6227
6824
|
|
6825
|
+
struct ggml_cgraph * build_grok() {
|
6826
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6827
|
+
|
6828
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
6829
|
+
int32_t n_tokens = this->n_tokens;
|
6830
|
+
|
6831
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6832
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6833
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6834
|
+
|
6835
|
+
struct ggml_tensor * cur;
|
6836
|
+
struct ggml_tensor * inpL;
|
6837
|
+
|
6838
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6839
|
+
|
6840
|
+
// multiply by embedding_multiplier_scale of 78.38367176906169
|
6841
|
+
inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
|
6842
|
+
|
6843
|
+
// inp_pos - contains the positions
|
6844
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6845
|
+
|
6846
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6847
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6848
|
+
|
6849
|
+
for (int il = 0; il < n_layer; ++il) {
|
6850
|
+
struct ggml_tensor * inpSA = inpL;
|
6851
|
+
|
6852
|
+
// norm
|
6853
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6854
|
+
model.layers[il].attn_norm, NULL,
|
6855
|
+
LLM_NORM_RMS, cb, il);
|
6856
|
+
cb(cur, "attn_norm", il);
|
6857
|
+
|
6858
|
+
|
6859
|
+
// self-attention
|
6860
|
+
{
|
6861
|
+
// compute Q and K and RoPE them
|
6862
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6863
|
+
cb(Qcur, "Qcur", il);
|
6864
|
+
if (model.layers[il].bq) {
|
6865
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
6866
|
+
cb(Qcur, "Qcur", il);
|
6867
|
+
}
|
6868
|
+
|
6869
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6870
|
+
cb(Kcur, "Kcur", il);
|
6871
|
+
if (model.layers[il].bk) {
|
6872
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
6873
|
+
cb(Kcur, "Kcur", il);
|
6874
|
+
}
|
6875
|
+
|
6876
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6877
|
+
cb(Vcur, "Vcur", il);
|
6878
|
+
if (model.layers[il].bv) {
|
6879
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
6880
|
+
cb(Vcur, "Vcur", il);
|
6881
|
+
}
|
6882
|
+
|
6883
|
+
Qcur = ggml_rope_custom(
|
6884
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6885
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6886
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6887
|
+
);
|
6888
|
+
cb(Qcur, "Qcur", il);
|
6889
|
+
|
6890
|
+
Kcur = ggml_rope_custom(
|
6891
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6892
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6893
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6894
|
+
);
|
6895
|
+
cb(Kcur, "Kcur", il);
|
6896
|
+
|
6897
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6898
|
+
model.layers[il].wo, model.layers[il].bo,
|
6899
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
6900
|
+
}
|
6901
|
+
|
6902
|
+
if (il == n_layer - 1) {
|
6903
|
+
// skip computing output for unused tokens
|
6904
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
6905
|
+
n_tokens = n_outputs;
|
6906
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
6907
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
6908
|
+
}
|
6909
|
+
|
6910
|
+
// Grok
|
6911
|
+
// if attn_out_norm is present then apply it before adding the input
|
6912
|
+
if (model.layers[il].attn_out_norm) {
|
6913
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6914
|
+
model.layers[il].attn_out_norm, NULL,
|
6915
|
+
LLM_NORM_RMS, cb, il);
|
6916
|
+
cb(cur, "attn_out_norm", il);
|
6917
|
+
}
|
6918
|
+
|
6919
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6920
|
+
cb(ffn_inp, "ffn_inp", il);
|
6921
|
+
|
6922
|
+
// feed-forward network
|
6923
|
+
// MoE branch
|
6924
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6925
|
+
model.layers[il].ffn_norm, NULL,
|
6926
|
+
LLM_NORM_RMS, cb, il);
|
6927
|
+
cb(cur, "ffn_norm", il);
|
6928
|
+
|
6929
|
+
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
6930
|
+
cb(logits, "ffn_moe_logits", il);
|
6931
|
+
|
6932
|
+
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
6933
|
+
cb(probs, "ffn_moe_probs", il);
|
6934
|
+
|
6935
|
+
// select experts
|
6936
|
+
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
6937
|
+
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
6938
|
+
|
6939
|
+
ggml_tensor * weights = ggml_get_rows(ctx0,
|
6940
|
+
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
6941
|
+
cb(weights, "ffn_moe_weights", il);
|
6942
|
+
|
6943
|
+
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
6944
|
+
|
6945
|
+
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
6946
|
+
cb(weights_sum, "ffn_moe_weights_sum", il);
|
6947
|
+
|
6948
|
+
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
6949
|
+
cb(weights, "ffn_moe_weights_norm", il);
|
6950
|
+
|
6951
|
+
// compute expert outputs
|
6952
|
+
ggml_tensor * moe_out = nullptr;
|
6953
|
+
|
6954
|
+
for (int i = 0; i < n_expert_used; ++i) {
|
6955
|
+
ggml_tensor * cur_expert;
|
6956
|
+
|
6957
|
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
6958
|
+
cb(cur_up, "ffn_moe_up", il);
|
6959
|
+
|
6960
|
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
6961
|
+
cb(cur_gate, "ffn_moe_gate", il);
|
6962
|
+
|
6963
|
+
//GeLU
|
6964
|
+
cur_gate = ggml_gelu(ctx0, cur_gate);
|
6965
|
+
cb(cur_gate, "ffn_moe_gelu", il);
|
6966
|
+
|
6967
|
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6968
|
+
cb(cur_expert, "ffn_moe_gate_par", il);
|
6969
|
+
|
6970
|
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
6971
|
+
cb(cur_expert, "ffn_moe_down", il);
|
6972
|
+
|
6973
|
+
cur_expert = ggml_mul(ctx0, cur_expert,
|
6974
|
+
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
6975
|
+
cb(cur_expert, "ffn_moe_weighted", il);
|
6976
|
+
|
6977
|
+
if (i == 0) {
|
6978
|
+
moe_out = cur_expert;
|
6979
|
+
} else {
|
6980
|
+
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
6981
|
+
cb(moe_out, "ffn_moe_out", il);
|
6982
|
+
}
|
6983
|
+
}
|
6984
|
+
|
6985
|
+
cur = moe_out;
|
6986
|
+
|
6987
|
+
// Grok
|
6988
|
+
// if layer_out_norm is present then apply it before adding the input
|
6989
|
+
// Idea: maybe ffn_out_norm is a better name
|
6990
|
+
if (model.layers[il].layer_out_norm) {
|
6991
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6992
|
+
model.layers[il].layer_out_norm, NULL,
|
6993
|
+
LLM_NORM_RMS, cb, il);
|
6994
|
+
cb(cur, "layer_out_norm", il);
|
6995
|
+
}
|
6996
|
+
|
6997
|
+
|
6998
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6999
|
+
cb(cur, "ffn_out", il);
|
7000
|
+
|
7001
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
7002
|
+
if (layer_dir != nullptr) {
|
7003
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
7004
|
+
}
|
7005
|
+
cb(cur, "l_out", il);
|
7006
|
+
|
7007
|
+
// input for next layer
|
7008
|
+
inpL = cur;
|
7009
|
+
}
|
7010
|
+
|
7011
|
+
cur = inpL;
|
7012
|
+
|
7013
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7014
|
+
model.output_norm, NULL,
|
7015
|
+
LLM_NORM_RMS, cb, -1);
|
7016
|
+
cb(cur, "result_norm", -1);
|
7017
|
+
|
7018
|
+
// lm_head
|
7019
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7020
|
+
|
7021
|
+
// Grok
|
7022
|
+
// multiply logits by output_multiplier_scale of 0.5773502691896257
|
7023
|
+
|
7024
|
+
cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
|
7025
|
+
|
7026
|
+
cb(cur, "result_output", -1);
|
7027
|
+
|
7028
|
+
ggml_build_forward_expand(gf, cur);
|
7029
|
+
|
7030
|
+
return gf;
|
7031
|
+
}
|
7032
|
+
|
6228
7033
|
struct ggml_cgraph * build_starcoder() {
|
6229
7034
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6230
7035
|
|
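Editor's note: both the renamed MoE tensors in build_llama and the new build_grok() above follow the same routing recipe: softmax over the gate logits, take the top n_expert_used experts per token, renormalize their probabilities so they sum to 1, then add up the expert FFN outputs weighted by those probabilities (Grok only swaps SiLU for GELU inside the expert FFN). A plain C++ sketch of the routing math for a single token, with made-up gate logits:

// Per-token MoE routing: softmax over gate logits, pick top-k experts,
// renormalize the selected weights. The expert FFNs themselves are omitted.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const std::vector<float> gate_logits = { 0.2f, 2.5f, -1.0f, 1.7f }; // n_expert = 4
    const int n_expert_used = 2;

    // softmax (ffn_moe_probs)
    std::vector<float> probs(gate_logits.size());
    const float max_logit = *std::max_element(gate_logits.begin(), gate_logits.end());
    float sum = 0.0f;
    for (size_t i = 0; i < gate_logits.size(); ++i) {
        probs[i] = expf(gate_logits[i] - max_logit);
        sum += probs[i];
    }
    for (float & p : probs) { p /= sum; }

    // top-k expert ids (ffn_moe_argsort + top_k)
    std::vector<int> ids(probs.size());
    std::iota(ids.begin(), ids.end(), 0);
    std::partial_sort(ids.begin(), ids.begin() + n_expert_used, ids.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });
    ids.resize(n_expert_used);

    // renormalize the selected weights (ffn_moe_weights_norm)
    float wsum = 0.0f;
    for (int id : ids) { wsum += probs[id]; }
    for (int id : ids) {
        printf("expert %d weight %.4f\n", id, probs[id] / wsum);
    }
    return 0;
}
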
@@ -6279,6 +7084,13 @@ struct llm_build_context {
|
|
6279
7084
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6280
7085
|
}
|
6281
7086
|
|
7087
|
+
if (il == n_layer - 1) {
|
7088
|
+
// skip computing output for unused tokens
|
7089
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7090
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7091
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7092
|
+
}
|
7093
|
+
|
6282
7094
|
// add the input
|
6283
7095
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
6284
7096
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -6476,6 +7288,13 @@ struct llm_build_context {
|
|
6476
7288
|
Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6477
7289
|
}
|
6478
7290
|
|
7291
|
+
if (il == n_layer - 1) {
|
7292
|
+
// skip computing output for unused tokens
|
7293
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7294
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7295
|
+
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
7296
|
+
}
|
7297
|
+
|
6479
7298
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
|
6480
7299
|
cb(ffn_inp, "ffn_inp", il);
|
6481
7300
|
|
@@ -6565,6 +7384,13 @@ struct llm_build_context {
|
|
6565
7384
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6566
7385
|
}
|
6567
7386
|
|
7387
|
+
if (il == n_layer - 1) {
|
7388
|
+
// skip computing output for unused tokens
|
7389
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7390
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7391
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7392
|
+
}
|
7393
|
+
|
6568
7394
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6569
7395
|
cb(ffn_inp, "ffn_inp", il);
|
6570
7396
|
|
@@ -6722,6 +7548,13 @@ struct llm_build_context {
|
|
6722
7548
|
}
|
6723
7549
|
cb(cur, "kqv_out", il);
|
6724
7550
|
|
7551
|
+
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
7552
|
+
// skip computing output for unused tokens
|
7553
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7554
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7555
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7556
|
+
}
|
7557
|
+
|
6725
7558
|
// re-add the layer input
|
6726
7559
|
cur = ggml_add(ctx0, cur, inpL);
|
6727
7560
|
|
@@ -6844,6 +7677,13 @@ struct llm_build_context {
|
|
6844
7677
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6845
7678
|
}
|
6846
7679
|
|
7680
|
+
if (il == n_layer - 1) {
|
7681
|
+
// skip computing output for unused tokens
|
7682
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7683
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7684
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7685
|
+
}
|
7686
|
+
|
6847
7687
|
// Add the input
|
6848
7688
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
6849
7689
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -6891,6 +7731,7 @@ struct llm_build_context {
|
|
6891
7731
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6892
7732
|
|
6893
7733
|
struct ggml_tensor * cur;
|
7734
|
+
struct ggml_tensor * pos;
|
6894
7735
|
struct ggml_tensor * inpL;
|
6895
7736
|
|
6896
7737
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
@@ -6901,6 +7742,16 @@ struct llm_build_context {
|
|
6901
7742
|
// positions of the tokens in the KV cache
|
6902
7743
|
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
6903
7744
|
|
7745
|
+
if (model.pos_embd) {
|
7746
|
+
// inp_pos - contains the positions
|
7747
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7748
|
+
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
7749
|
+
cb(pos, "pos_embd", -1);
|
7750
|
+
|
7751
|
+
inpL = ggml_add(ctx0, inpL, pos);
|
7752
|
+
cb(inpL, "inpL", -1);
|
7753
|
+
}
|
7754
|
+
|
6904
7755
|
for (int il = 0; il < n_layer; ++il) {
|
6905
7756
|
struct ggml_tensor * attn_norm;
|
6906
7757
|
|
@@ -6935,11 +7786,39 @@ struct llm_build_context {
|
|
6935
7786
|
cb(Kcur, "Kcur", il);
|
6936
7787
|
cb(Vcur, "Vcur", il);
|
6937
7788
|
|
6938
|
-
|
7789
|
+
// Q/K Layernorm
|
7790
|
+
if (model.layers[il].attn_q_norm) {
|
7791
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
7792
|
+
model.layers[il].attn_q_norm,
|
7793
|
+
model.layers[il].attn_q_norm_b,
|
7794
|
+
LLM_NORM, cb, il);
|
7795
|
+
cb(Qcur, "Qcur", il);
|
6939
7796
|
|
6940
|
-
|
7797
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
7798
|
+
model.layers[il].attn_k_norm,
|
7799
|
+
model.layers[il].attn_k_norm_b,
|
7800
|
+
LLM_NORM, cb, il);
|
7801
|
+
cb(Kcur, "Kcur", il);
|
7802
|
+
|
7803
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7804
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
7805
|
+
|
7806
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6941
7807
|
model.layers[il].wo, model.layers[il].bo,
|
6942
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
7808
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7809
|
+
} else {
|
7810
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7811
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7812
|
+
model.layers[il].wo, model.layers[il].bo,
|
7813
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7814
|
+
}
|
7815
|
+
}
|
7816
|
+
|
7817
|
+
if (il == n_layer - 1) {
|
7818
|
+
// skip computing output for unused tokens
|
7819
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7820
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7821
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
6943
7822
|
}
|
6944
7823
|
|
6945
7824
|
// Add the input
|
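Editor's note: the MPT branch above now optionally applies LayerNorm to the Q and K projections (query-key normalization) using the newly loaded attn_q_norm/attn_k_norm weights and biases before reshaping for attention. A scalar sketch of what llm_build_norm(..., LLM_NORM, ...) computes for one vector; the epsilon below is illustrative (the real value comes from hparams.f_norm_eps).

// Scalar LayerNorm over one Q/K vector: zero mean, unit variance, then
// scale by the norm weight and add the norm bias.
#include <cmath>
#include <cstdio>
#include <vector>

static void layer_norm(std::vector<float> & x, const std::vector<float> & w,
                       const std::vector<float> & b, float eps = 1e-5f) {
    float mean = 0.0f;
    for (float v : x) { mean += v; }
    mean /= x.size();

    float var = 0.0f;
    for (float v : x) { var += (v - mean)*(v - mean); }
    var /= x.size();

    const float inv_std = 1.0f / sqrtf(var + eps);
    for (size_t i = 0; i < x.size(); ++i) {
        x[i] = (x[i] - mean)*inv_std*w[i] + b[i];
    }
}

int main() {
    std::vector<float> q = { 1.0f, 2.0f, 3.0f, 4.0f };
    std::vector<float> w(4, 1.0f), b(4, 0.0f);
    layer_norm(q, w, b);
    for (float v : q) { printf("%.4f ", v); }
    printf("\n");
    return 0;
}
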
@@ -7055,6 +7934,13 @@ struct llm_build_context {
|
|
7055
7934
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7056
7935
|
}
|
7057
7936
|
|
7937
|
+
if (il == n_layer - 1) {
|
7938
|
+
// skip computing output for unused tokens
|
7939
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7940
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7941
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7942
|
+
}
|
7943
|
+
|
7058
7944
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7059
7945
|
cb(ffn_inp, "ffn_inp", il);
|
7060
7946
|
|
@@ -7161,6 +8047,13 @@ struct llm_build_context {
|
|
7161
8047
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7162
8048
|
}
|
7163
8049
|
|
8050
|
+
if (il == n_layer - 1) {
|
8051
|
+
// skip computing output for unused tokens
|
8052
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8053
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8054
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8055
|
+
}
|
8056
|
+
|
7164
8057
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7165
8058
|
cb(ffn_inp, "ffn_inp", il);
|
7166
8059
|
|
@@ -7273,6 +8166,13 @@ struct llm_build_context {
|
|
7273
8166
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7274
8167
|
}
|
7275
8168
|
|
8169
|
+
if (il == n_layer - 1) {
|
8170
|
+
// skip computing output for unused tokens
|
8171
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8172
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8173
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8174
|
+
}
|
8175
|
+
|
7276
8176
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7277
8177
|
cb(ffn_inp, "ffn_inp", il);
|
7278
8178
|
|
@@ -7391,6 +8291,14 @@ struct llm_build_context {
|
|
7391
8291
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
7392
8292
|
}
|
7393
8293
|
|
8294
|
+
if (il == n_layer - 1) {
|
8295
|
+
// skip computing output for unused tokens
|
8296
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8297
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8298
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8299
|
+
attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
|
8300
|
+
}
|
8301
|
+
|
7394
8302
|
// FF
|
7395
8303
|
{
|
7396
8304
|
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
|
@@ -7488,6 +8396,14 @@ struct llm_build_context {
|
|
7488
8396
|
|
7489
8397
|
cur = attention_norm;
|
7490
8398
|
|
8399
|
+
if (il == n_layer - 1) {
|
8400
|
+
// skip computing output for unused tokens
|
8401
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8402
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8403
|
+
sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
|
8404
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8405
|
+
}
|
8406
|
+
|
7491
8407
|
// feed-forward network
|
7492
8408
|
{
|
7493
8409
|
cur = llm_build_ffn(ctx0, cur,
|
@@ -7580,6 +8496,13 @@ struct llm_build_context {
|
|
7580
8496
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7581
8497
|
}
|
7582
8498
|
|
8499
|
+
if (il == n_layer - 1) {
|
8500
|
+
// skip computing output for unused tokens
|
8501
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8502
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8503
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8504
|
+
}
|
8505
|
+
|
7583
8506
|
// add the input
|
7584
8507
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
7585
8508
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -7680,6 +8603,13 @@ struct llm_build_context {
|
|
7680
8603
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7681
8604
|
}
|
7682
8605
|
|
8606
|
+
if (il == n_layer - 1) {
|
8607
|
+
// skip computing output for unused tokens
|
8608
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8609
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8610
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
8611
|
+
}
|
8612
|
+
|
7683
8613
|
// add the input
|
7684
8614
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
7685
8615
|
cb(ffn_inp, "ffn_inp", il);
|
@@ -7789,6 +8719,13 @@ struct llm_build_context {
|
|
7789
8719
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7790
8720
|
}
|
7791
8721
|
|
8722
|
+
if (il == n_layer - 1) {
|
8723
|
+
// skip computing output for unused tokens
|
8724
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8725
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8726
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8727
|
+
}
|
8728
|
+
|
7792
8729
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7793
8730
|
cb(ffn_inp, "ffn_inp", il);
|
7794
8731
|
|
@@ -7899,6 +8836,13 @@ struct llm_build_context {
|
|
7899
8836
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7900
8837
|
}
|
7901
8838
|
|
8839
|
+
if (il == n_layer - 1) {
|
8840
|
+
// skip computing output for unused tokens
|
8841
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8842
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8843
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8844
|
+
}
|
8845
|
+
|
7902
8846
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7903
8847
|
cb(ffn_inp, "ffn_inp", il);
|
7904
8848
|
|
@@ -8022,6 +8966,13 @@ struct llm_build_context {
|
|
8022
8966
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8023
8967
|
}
|
8024
8968
|
|
8969
|
+
if (il == n_layer - 1) {
|
8970
|
+
// skip computing output for unused tokens
|
8971
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8972
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8973
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8974
|
+
}
|
8975
|
+
|
8025
8976
|
// scale_res - scale the hidden states for residual connection
|
8026
8977
|
const float scale_res = scale_depth/sqrtf(float(n_layer));
|
8027
8978
|
cur = ggml_scale(ctx0, cur, scale_res);
|
@@ -8136,6 +9087,13 @@ struct llm_build_context {
|
|
8136
9087
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
8137
9088
|
}
|
8138
9089
|
|
9090
|
+
if (il == n_layer - 1) {
|
9091
|
+
// skip computing output for unused tokens
|
9092
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9093
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9094
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9095
|
+
}
|
9096
|
+
|
8139
9097
|
struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
8140
9098
|
cb(sa_out, "sa_out", il);
|
8141
9099
|
|
@@ -8248,6 +9206,13 @@ struct llm_build_context {
|
|
8248
9206
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8249
9207
|
}
|
8250
9208
|
|
9209
|
+
if (il == n_layer - 1) {
|
9210
|
+
// skip computing output for unused tokens
|
9211
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9212
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9213
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
9214
|
+
}
|
9215
|
+
|
8251
9216
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
8252
9217
|
cb(ffn_inp, "ffn_inp", il);
|
8253
9218
|
|
@@ -8395,6 +9360,15 @@ struct llm_build_context {
|
|
8395
9360
|
|
8396
9361
|
struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
|
8397
9362
|
|
9363
|
+
if (il == n_layer - 1) {
|
9364
|
+
// skip computing output for unused tokens
|
9365
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9366
|
+
x = ggml_get_rows(ctx0, x, inp_out_ids);
|
9367
|
+
y = ggml_get_rows(ctx0, y, inp_out_ids);
|
9368
|
+
z = ggml_get_rows(ctx0, z, inp_out_ids);
|
9369
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9370
|
+
}
|
9371
|
+
|
8398
9372
|
// {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
|
8399
9373
|
y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
|
8400
9374
|
y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
|
@@ -8497,6 +9471,14 @@ struct llm_build_context {
|
|
8497
9471
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8498
9472
|
}
|
8499
9473
|
|
9474
|
+
if (il == n_layer - 1) {
|
9475
|
+
// skip computing output for unused tokens
|
9476
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
9477
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9478
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
9479
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
9480
|
+
}
|
9481
|
+
|
8500
9482
|
struct ggml_tensor * attn_out = cur;
|
8501
9483
|
|
8502
9484
|
// feed-forward network
|
@@ -8648,6 +9630,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
8648
9630
|
{
|
8649
9631
|
result = llm.build_falcon();
|
8650
9632
|
} break;
|
9633
|
+
case LLM_ARCH_GROK:
|
9634
|
+
{
|
9635
|
+
result = llm.build_grok();
|
9636
|
+
} break;
|
8651
9637
|
case LLM_ARCH_STARCODER:
|
8652
9638
|
{
|
8653
9639
|
result = llm.build_starcoder();
|
@@ -8725,6 +9711,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
8725
9711
|
{
|
8726
9712
|
result = llm.build_mamba();
|
8727
9713
|
} break;
|
9714
|
+
case LLM_ARCH_XVERSE:
|
9715
|
+
{
|
9716
|
+
result = llm.build_xverse();
|
9717
|
+
} break;
|
8728
9718
|
case LLM_ARCH_COMMAND_R:
|
8729
9719
|
{
|
8730
9720
|
result = llm.build_command_r();
|
@@ -8790,9 +9780,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
8790
9780
|
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
8791
9781
|
}
|
8792
9782
|
|
9783
|
+
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
9784
|
+
GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
|
9785
|
+
const int64_t n_tokens = batch.n_tokens;
|
9786
|
+
|
9787
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
|
9788
|
+
int32_t * data = (int32_t *) lctx.inp_out_ids->data;
|
9789
|
+
|
9790
|
+
if (lctx.n_outputs == n_tokens) {
|
9791
|
+
for (int i = 0; i < n_tokens; ++i) {
|
9792
|
+
data[i] = i;
|
9793
|
+
}
|
9794
|
+
} else if (batch.logits) {
|
9795
|
+
int32_t n_outputs = 0;
|
9796
|
+
for (int i = 0; i < n_tokens; ++i) {
|
9797
|
+
if (batch.logits[i]) {
|
9798
|
+
data[n_outputs++] = i;
|
9799
|
+
}
|
9800
|
+
}
|
9801
|
+
// the graph needs to have been passed the correct number of outputs
|
9802
|
+
GGML_ASSERT(lctx.n_outputs == n_outputs);
|
9803
|
+
} else if (lctx.n_outputs == 1) {
|
9804
|
+
// only keep last output
|
9805
|
+
data[0] = n_tokens - 1;
|
9806
|
+
} else {
|
9807
|
+
GGML_ASSERT(lctx.n_outputs == 0);
|
9808
|
+
}
|
9809
|
+
}
|
9810
|
+
|
8793
9811
|
GGML_ASSERT(
|
9812
|
+
// (!a || b) is a logical implication (a -> b)
|
9813
|
+
// !hparams.causal_attn -> !cparams.causal_attn
|
8794
9814
|
(hparams.causal_attn || !cparams.causal_attn) &&
|
8795
|
-
"
|
9815
|
+
"causal attention with embedding models is not supported"
|
8796
9816
|
);
|
8797
9817
|
|
8798
9818
|
if (lctx.inp_KQ_mask) {
|
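Editor's note: the new llama_set_inputs block above fills inp_out_ids with the positions that actually produce output: every index when all tokens are requested, the indices where batch.logits[i] is set, or just the last token. The same selection logic pulled out on its own, with hypothetical inputs:

// Map a per-token logits flag to the list of row indices the graph keeps.
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<int32_t> output_ids(int n_tokens, const int8_t * logits, int n_outputs) {
    std::vector<int32_t> ids;
    if (n_outputs == n_tokens) {
        for (int i = 0; i < n_tokens; ++i) { ids.push_back(i); }  // every token
    } else if (logits) {
        for (int i = 0; i < n_tokens; ++i) {
            if (logits[i]) { ids.push_back(i); }                  // requested tokens only
        }
    } else if (n_outputs == 1) {
        ids.push_back(n_tokens - 1);                              // last token only
    }
    return ids;
}

int main() {
    const int8_t flags[5] = { 0, 1, 0, 0, 1 };
    for (int32_t id : output_ids(5, flags, 2)) { printf("%d ", id); }
    printf("\n");
    return 0;
}
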
@@ -8971,6 +9991,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
8971
9991
|
}
|
8972
9992
|
}
|
8973
9993
|
|
9994
|
+
// Make sure enough space is available for outputs.
|
9995
|
+
// Returns max number of outputs for which space was reserved.
|
9996
|
+
static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
|
9997
|
+
const auto & cparams = lctx.cparams;
|
9998
|
+
const auto & hparams = lctx.model.hparams;
|
9999
|
+
|
10000
|
+
const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
|
10001
|
+
|
10002
|
+
const auto n_batch = cparams.n_batch;
|
10003
|
+
const auto n_vocab = hparams.n_vocab;
|
10004
|
+
const auto n_embd = hparams.n_embd;
|
10005
|
+
|
10006
|
+
// TODO: use a per-batch flag for logits presence instead
|
10007
|
+
const bool has_logits = cparams.causal_attn;
|
10008
|
+
const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
|
10009
|
+
|
10010
|
+
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
|
10011
|
+
const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
|
10012
|
+
|
10013
|
+
if (lctx.output_ids.empty()) {
|
10014
|
+
// init, never resized afterwards
|
10015
|
+
lctx.output_ids.resize(n_batch);
|
10016
|
+
}
|
10017
|
+
|
10018
|
+
const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
|
10019
|
+
const size_t new_size = (logits_size + embd_size) * sizeof(float);
|
10020
|
+
|
10021
|
+
// alloc only when more than the current capacity is required
|
10022
|
+
// TODO: also consider shrinking the buffer
|
10023
|
+
if (!lctx.buf_output || prev_size < new_size) {
|
10024
|
+
if (lctx.buf_output) {
|
10025
|
+
#ifndef NDEBUG
|
10026
|
+
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
|
10027
|
+
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
10028
|
+
#endif
|
10029
|
+
ggml_backend_buffer_free(lctx.buf_output);
|
10030
|
+
lctx.buf_output = nullptr;
|
10031
|
+
lctx.logits = nullptr;
|
10032
|
+
lctx.embd = nullptr;
|
10033
|
+
}
|
10034
|
+
|
10035
|
+
lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
|
10036
|
+
if (lctx.buf_output == nullptr) {
|
10037
|
+
LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
|
10038
|
+
return 0;
|
10039
|
+
}
|
10040
|
+
}
|
10041
|
+
|
10042
|
+
float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
|
10043
|
+
|
10044
|
+
lctx.logits = has_logits ? output_base : nullptr;
|
10045
|
+
lctx.embd = has_embd ? output_base + logits_size : nullptr;
|
10046
|
+
|
10047
|
+
lctx.output_size = n_outputs_max;
|
10048
|
+
lctx.logits_size = logits_size;
|
10049
|
+
lctx.embd_size = embd_size;
|
10050
|
+
|
10051
|
+
// set all ids as invalid (negative)
|
10052
|
+
std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
|
10053
|
+
|
10054
|
+
ggml_backend_buffer_clear(lctx.buf_output, 0);
|
10055
|
+
|
10056
|
+
lctx.n_outputs = 0;
|
10057
|
+
|
10058
|
+
return n_outputs_max;
|
10059
|
+
}
|
10060
|
+
|
10061
|
+
|
8974
10062
|
static void llama_graph_compute(
|
8975
10063
|
llama_context & lctx,
|
8976
10064
|
ggml_cgraph * gf,
|
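Editor's note: llama_output_reserve() above sizes a single host buffer to hold both logits and embeddings for at most max(n_outputs, n_seq_max) output rows, and only reallocates when the requirement grows. The size arithmetic in isolation; the vocabulary and embedding sizes below are hypothetical examples, not values from the diff.

// Logits and embeddings share one float buffer, sized for the worst-case
// number of output rows.
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    const size_t n_outputs  = 3;     // outputs requested for this batch
    const size_t n_seq_max  = 8;     // cparams.n_seq_max
    const size_t n_vocab    = 32000; // hypothetical hparams.n_vocab
    const size_t n_embd     = 4096;  // hypothetical hparams.n_embd
    const bool   has_logits = true;  // causal models
    const bool   has_embd   = false; // embedding extraction disabled here

    const size_t n_outputs_max = std::max(n_outputs, n_seq_max);
    const size_t logits_size   = has_logits ? n_vocab*n_outputs_max : 0;
    const size_t embd_size     = has_embd   ? n_embd *n_outputs_max : 0;
    const size_t new_size      = (logits_size + embd_size)*sizeof(float);

    printf("output buffer: %.2f MiB\n", new_size/(1024.0*1024.0));
    return 0;
}
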
@@ -9046,16 +10134,8 @@ static int llama_decode_internal(
|
|
9046
10134
|
const int64_t n_embd = hparams.n_embd;
|
9047
10135
|
const int64_t n_vocab = hparams.n_vocab;
|
9048
10136
|
|
9049
|
-
|
9050
|
-
|
9051
|
-
|
9052
|
-
#ifndef NDEBUG
|
9053
|
-
auto & logits_valid = lctx.logits_valid;
|
9054
|
-
logits_valid.clear();
|
9055
|
-
logits_valid.resize(n_tokens_all);
|
9056
|
-
|
9057
|
-
memset(logits_out, 0, lctx.logits_size*sizeof(float));
|
9058
|
-
#endif
|
10137
|
+
uint32_t n_outputs = 0;
|
10138
|
+
uint32_t n_outputs_prev = 0;
|
9059
10139
|
|
9060
10140
|
const auto n_ubatch = cparams.n_ubatch;
|
9061
10141
|
|
@@ -9064,6 +10144,38 @@ static int llama_decode_internal(
|
|
9064
10144
|
std::vector<llama_seq_id *> seq_id_arr;
|
9065
10145
|
std::vector<std::vector<llama_seq_id>> seq_id;
|
9066
10146
|
|
10147
|
+
// count outputs
|
10148
|
+
if (batch_all.logits) {
|
10149
|
+
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
10150
|
+
n_outputs += batch_all.logits[i] != 0;
|
10151
|
+
}
|
10152
|
+
} else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
|
10153
|
+
n_outputs = n_tokens_all;
|
10154
|
+
} else {
|
10155
|
+
// keep last output only
|
10156
|
+
n_outputs = 1;
|
10157
|
+
}
|
10158
|
+
|
10159
|
+
// reserve output buffer
|
10160
|
+
if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
|
10161
|
+
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
|
10162
|
+
return -2;
|
10163
|
+
};
|
10164
|
+
|
10165
|
+
// set output mappings
|
10166
|
+
if (batch_all.logits) {
|
10167
|
+
int32_t i_logits = 0;
|
10168
|
+
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
10169
|
+
if (batch_all.logits[i]) {
|
10170
|
+
lctx.output_ids[i] = i_logits++;
|
10171
|
+
}
|
10172
|
+
}
|
10173
|
+
} else {
|
10174
|
+
for (uint32_t i = 0; i < n_outputs; ++i) {
|
10175
|
+
lctx.output_ids[i] = i;
|
10176
|
+
}
|
10177
|
+
}
|
10178
|
+
|
9067
10179
|
for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
|
9068
10180
|
const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
|
9069
10181
|
llama_batch u_batch = {
|
@@ -9079,6 +10191,27 @@ static int llama_decode_internal(
|
|
9079
10191
|
/* .all_seq_id = */ batch_all.all_seq_id,
|
9080
10192
|
};
|
9081
10193
|
|
10194
|
+
// count the outputs in this u_batch
|
10195
|
+
{
|
10196
|
+
int32_t n_outputs_new = 0;
|
10197
|
+
|
10198
|
+
if (u_batch.logits) {
|
10199
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
10200
|
+
n_outputs_new += u_batch.logits[i] != 0;
|
10201
|
+
}
|
10202
|
+
} else if (n_outputs == n_tokens_all) {
|
10203
|
+
n_outputs_new = n_tokens;
|
10204
|
+
} else {
|
10205
|
+
// keep last output only
|
10206
|
+
if (cur_token + n_tokens >= n_tokens_all) {
|
10207
|
+
n_outputs_new = 1;
|
10208
|
+
}
|
10209
|
+
}
|
10210
|
+
|
10211
|
+
// needs to happen before the graph is built
|
10212
|
+
lctx.n_outputs = n_outputs_new;
|
10213
|
+
}
|
10214
|
+
|
9082
10215
|
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
9083
10216
|
GGML_ASSERT(n_threads > 0);
|
9084
10217
|
|
@@ -9142,23 +10275,37 @@ static int llama_decode_internal(
|
|
9142
10275
|
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
9143
10276
|
struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
|
9144
10277
|
|
9145
|
-
if (
|
10278
|
+
if (lctx.n_outputs == 0) {
|
10279
|
+
// no output
|
10280
|
+
res = nullptr;
|
10281
|
+
embd = nullptr;
|
10282
|
+
} else if (!hparams.causal_attn) {
|
9146
10283
|
res = nullptr; // do not extract logits for embedding models such as BERT
|
9147
10284
|
|
9148
10285
|
// token or sequence embeddings
|
9149
10286
|
embd = gf->nodes[gf->n_nodes - 1];
|
9150
10287
|
|
9151
10288
|
GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
|
9152
|
-
} else {
|
9153
|
-
|
9154
|
-
|
9155
|
-
|
9156
|
-
|
9157
|
-
|
9158
|
-
|
9159
|
-
}
|
9160
|
-
|
10289
|
+
} else if (cparams.embeddings) {
|
10290
|
+
// the embeddings could be in the second to last tensor, or any of the previous tensors
|
10291
|
+
int i_embd = gf->n_nodes - 2;
|
10292
|
+
for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
|
10293
|
+
i_embd = gf->n_nodes - i;
|
10294
|
+
if (i_embd < 0) { break; }
|
10295
|
+
embd = gf->nodes[i_embd];
|
10296
|
+
}
|
10297
|
+
GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
|
10298
|
+
|
10299
|
+
// TODO: use a per-batch flag to know when to skip logits while keeping embeddings
|
10300
|
+
if (!cparams.causal_attn) {
|
10301
|
+
res = nullptr; // do not extract logits when not needed
|
10302
|
+
// skip computing logits
|
10303
|
+
// TODO: is this safe?
|
10304
|
+
gf->n_nodes = i_embd + 1;
|
9161
10305
|
}
|
10306
|
+
} else {
|
10307
|
+
embd = nullptr; // do not extract embeddings when not needed
|
10308
|
+
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
|
9162
10309
|
}
|
9163
10310
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
9164
10311
|
|
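Editor's note: the replacement logic above walks the graph backwards from the second-to-last node until it finds the tensor named "result_norm" when embeddings are wanted, and otherwise expects "result_output" to be the last node. A compact sketch of that name-based scan, assuming the public ggml_cgraph fields (n_nodes, nodes) of this ggml version; it is not the function from the diff.

// Find the output tensor of a built graph by name.
#include "ggml.h"
#include <cstring>

static struct ggml_tensor * find_output_node(struct ggml_cgraph * gf, bool want_embeddings) {
    if (gf->n_nodes == 0) {
        return NULL;
    }
    if (!want_embeddings) {
        struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
        return strcmp(res->name, "result_output") == 0 ? res : NULL;
    }
    // embeddings: scan backwards for the pre-lm_head normalization output
    for (int i = gf->n_nodes - 1; i >= 0; --i) {
        if (strcmp(gf->nodes[i]->name, "result_norm") == 0) {
            return gf->nodes[i];
        }
    }
    return NULL;
}
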
@@ -9201,50 +10348,23 @@ static int llama_decode_internal(
|
|
9201
10348
|
//}
|
9202
10349
|
|
9203
10350
|
// extract logits
|
9204
|
-
// TODO: do not compute and extract logits if only embeddings are needed
|
9205
|
-
// update the graphs to skip "result_output" if logits are not needed
|
9206
10351
|
if (res) {
|
9207
10352
|
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
|
9208
10353
|
GGML_ASSERT(backend_res != nullptr);
|
9209
|
-
|
9210
|
-
|
9211
|
-
|
9212
|
-
|
9213
|
-
|
9214
|
-
|
9215
|
-
|
9216
|
-
|
9217
|
-
|
9218
|
-
// extract logits for the range [i_first, i_last)
|
9219
|
-
// group the requests to minimize the number of calls to the backend
|
9220
|
-
ggml_backend_tensor_get_async(backend_res, res,
|
9221
|
-
logits_out + n_vocab*(cur_token + i_first),
|
9222
|
-
i_first*n_vocab*sizeof(float),
|
9223
|
-
(i_last - i_first)*n_vocab*sizeof(float));
|
9224
|
-
i_first = -1;
|
9225
|
-
}
|
9226
|
-
}
|
9227
|
-
#ifndef NDEBUG
|
9228
|
-
logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
|
9229
|
-
#endif
|
9230
|
-
}
|
9231
|
-
} else if (lctx.logits_all) {
|
9232
|
-
ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
|
9233
|
-
#ifndef NDEBUG
|
9234
|
-
std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
|
9235
|
-
#endif
|
9236
|
-
} else {
|
9237
|
-
if (cur_token + n_tokens >= n_tokens_all) {
|
9238
|
-
ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
|
9239
|
-
#ifndef NDEBUG
|
9240
|
-
logits_valid[0] = true;
|
9241
|
-
#endif
|
9242
|
-
}
|
10354
|
+
GGML_ASSERT(lctx.logits != nullptr);
|
10355
|
+
|
10356
|
+
float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
|
10357
|
+
const int32_t n_outputs_new = lctx.n_outputs;
|
10358
|
+
|
10359
|
+
if (n_outputs_new) {
|
10360
|
+
GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
|
10361
|
+
GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
|
10362
|
+
ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
|
9243
10363
|
}
|
9244
10364
|
}
|
9245
10365
|
|
9246
10366
|
// extract embeddings
|
9247
|
-
if (
|
10367
|
+
if (embd) {
|
9248
10368
|
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
|
9249
10369
|
GGML_ASSERT(backend_embd != nullptr);
|
9250
10370
|
|
@@ -9252,16 +10372,14 @@ static int llama_decode_internal(
|
|
9252
10372
|
case LLAMA_POOLING_TYPE_NONE:
|
9253
10373
|
{
|
9254
10374
|
// extract token embeddings
|
9255
|
-
|
9256
|
-
|
9257
|
-
|
9258
|
-
|
9259
|
-
|
9260
|
-
|
9261
|
-
|
9262
|
-
|
9263
|
-
ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
|
9264
|
-
}
|
10375
|
+
GGML_ASSERT(lctx.embd != nullptr);
|
10376
|
+
float * embd_out = lctx.embd + n_outputs_prev*n_embd;
|
10377
|
+
const int32_t n_outputs_new = lctx.n_outputs;
|
10378
|
+
|
10379
|
+
if (n_outputs_new) {
|
10380
|
+
GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
|
10381
|
+
GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
|
10382
|
+
ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
|
9265
10383
|
}
|
9266
10384
|
} break;
|
9267
10385
|
case LLAMA_POOLING_TYPE_CLS:
|
@@ -9288,6 +10406,7 @@ static int llama_decode_internal(
|
|
9288
10406
|
} break;
|
9289
10407
|
}
|
9290
10408
|
}
|
10409
|
+
n_outputs_prev += lctx.n_outputs;
|
9291
10410
|
}
|
9292
10411
|
|
9293
10412
|
// wait for the computation to finish (automatically done when obtaining the model output)
|
@@ -10218,7 +11337,7 @@ struct llm_tokenizer_wpm {
|
|
10218
11337
|
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
|
10219
11338
|
continue;
|
10220
11339
|
}
|
10221
|
-
code =
|
11340
|
+
code = unicode_tolower(code);
|
10222
11341
|
if (type == CODEPOINT_TYPE_WHITESPACE) {
|
10223
11342
|
code = ' ';
|
10224
11343
|
}
|
@@ -10238,7 +11357,7 @@ struct llm_tokenizer_wpm {
|
|
10238
11357
|
std::vector<std::string> words;
|
10239
11358
|
while (r < new_str.size()) {
|
10240
11359
|
// if is whitespace
|
10241
|
-
if (isspace(new_str[r])) {
|
11360
|
+
if (isspace(new_str[r], std::locale::classic())) {
|
10242
11361
|
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
10243
11362
|
l = r + 1;
|
10244
11363
|
r = l;
|
@@ -10252,18 +11371,12 @@ struct llm_tokenizer_wpm {
|
|
10252
11371
|
return words;
|
10253
11372
|
}
|
10254
11373
|
|
10255
|
-
uint32_t to_lower(uint32_t code) {
|
10256
|
-
static const std::locale locale("en_US.UTF-8");
|
10257
|
-
#if defined(_WIN32)
|
10258
|
-
if (code > 0xFFFF) {
|
10259
|
-
return code;
|
10260
|
-
}
|
10261
|
-
#endif
|
10262
|
-
return std::tolower(wchar_t(code), locale);
|
10263
|
-
}
|
10264
|
-
|
10265
11374
|
bool is_ascii_punct(uint32_t code) {
|
10266
|
-
|
11375
|
+
if (code > 0xFF) {
|
11376
|
+
return false;
|
11377
|
+
}
|
11378
|
+
auto c = char(static_cast<unsigned char>(code));
|
11379
|
+
return ispunct(c, std::locale::classic());
|
10267
11380
|
}
|
10268
11381
|
|
10269
11382
|
bool is_chinese_char(uint32_t cpt) {
|
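The tokenizer changes above replace the locale-dependent to_lower() helper with unicode_tolower() (provided by the unicode.cpp/unicode-data.cpp files added in this release) and switch the ASCII checks to std::locale::classic(). A small sketch of the classic-locale pattern; the helper name below is illustrative, not part of the diff.

#include <cstdint>
#include <locale>

// Sketch: locale-independent ASCII classification as used by the WPM tokenizer above.
// std::locale::classic() is the "C" locale, so the result does not depend on the
// host's LC_CTYPE setting (the old code constructed "en_US.UTF-8", which may not exist).
static bool is_ascii_ws(uint32_t code) {
    if (code > 0x7F) {
        return false; // non-ASCII codepoints are handled by the unicode tables
    }
    return std::isspace(char(code), std::locale::classic());
}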
@@ -10508,28 +11621,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
10508
11621
|
// grammar - internal
|
10509
11622
|
//
|
10510
11623
|
|
10511
|
-
struct llama_partial_utf8 {
|
10512
|
-
uint32_t value; // bit value so far (unshifted)
|
10513
|
-
int n_remain; // num bytes remaining; -1 indicates invalid sequence
|
10514
|
-
};
|
10515
|
-
|
10516
|
-
struct llama_grammar {
|
10517
|
-
const std::vector<std::vector<llama_grammar_element>> rules;
|
10518
|
-
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
10519
|
-
|
10520
|
-
// buffer for partially generated UTF-8 sequence from accepted tokens
|
10521
|
-
llama_partial_utf8 partial_utf8;
|
10522
|
-
};
|
10523
|
-
|
10524
|
-
struct llama_grammar_candidate {
|
10525
|
-
size_t index;
|
10526
|
-
const uint32_t * code_points;
|
10527
|
-
llama_partial_utf8 partial_utf8;
|
10528
|
-
};
|
10529
11624
|
|
10530
11625
|
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
|
10531
11626
|
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
|
10532
|
-
|
11627
|
+
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
10533
11628
|
const std::string & src,
|
10534
11629
|
llama_partial_utf8 partial_start) {
|
10535
11630
|
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
|
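For reference, the lookup table in decode_utf8 above maps the high nibble of a UTF-8 lead byte to the total sequence length, with 0 marking continuation bytes that are invalid in the lead position. A self-contained restatement:

// Sketch of what the lookup table above encodes.
static int utf8_seq_len(unsigned char first_byte) {
    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
    return lookup[first_byte >> 4];
}
// e.g. 'a' (0x61) -> 1, 0xC3 -> 2, 0xE2 -> 3, 0xF0 -> 4, 0x80 -> 0 (continuation byte)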
@@ -10731,7 +11826,7 @@ static void llama_grammar_advance_stack(
|
|
10731
11826
|
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
10732
11827
|
// produces the N possible stacks if the given char is accepted at those
|
10733
11828
|
// positions
|
10734
|
-
|
11829
|
+
std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
10735
11830
|
const std::vector<std::vector<llama_grammar_element>> & rules,
|
10736
11831
|
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
10737
11832
|
const uint32_t chr) {
|
@@ -11957,7 +13052,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
11957
13052
|
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
11958
13053
|
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
11959
13054
|
// tensor name.
|
11960
|
-
n_layer /= n_expert;
|
11961
13055
|
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
|
11962
13056
|
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
|
11963
13057
|
}
|
@@ -11971,30 +13065,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
11971
13065
|
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
|
11972
13066
|
// with the quantization of the output tensor
|
11973
13067
|
if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
|
11974-11983
|
-
… (content of removed lines 11974-11983 is not rendered in this diff view)
|
13068
|
+
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
13069
|
+
new_type = qs.params->output_tensor_type;
|
13070
|
+
} else {
|
13071
|
+
int nx = tensor->ne[0];
|
13072
|
+
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
13073
|
+
new_type = GGML_TYPE_Q8_0;
|
13074
|
+
}
|
13075
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
13076
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
|
13077
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
13078
|
+
new_type = GGML_TYPE_Q5_K;
|
13079
|
+
}
|
13080
|
+
else if (new_type != GGML_TYPE_Q8_0) {
|
13081
|
+
new_type = GGML_TYPE_Q6_K;
|
13082
|
+
}
|
11984
13083
|
}
|
11985
13084
|
} else if (name == "token_embd.weight") {
|
11986-11994
|
-
if ( … (content of removed lines 11986-11994 is truncated in this diff view)
|
13085
|
+
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
|
13086
|
+
new_type = qs.params->token_embedding_type;
|
13087
|
+
} else {
|
13088
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
|
13089
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
13090
|
+
new_type = GGML_TYPE_Q2_K;
|
13091
|
+
}
|
13092
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
13093
|
+
new_type = GGML_TYPE_IQ3_S;
|
13094
|
+
}
|
13095
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
13096
|
+
new_type = GGML_TYPE_IQ3_S;
|
13097
|
+
}
|
11995
13098
|
}
|
11996
13099
|
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
11997
|
-
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
13100
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
|
11998
13101
|
if (name.find("attn_v.weight") != std::string::npos) {
|
11999
13102
|
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
12000
13103
|
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
@@ -12013,7 +13116,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
12013
13116
|
if (qs.model.hparams.n_expert == 8) {
|
12014
13117
|
new_type = GGML_TYPE_Q5_K;
|
12015
13118
|
} else {
|
12016
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
|
13119
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
|
12017
13120
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
|
12018
13121
|
}
|
12019
13122
|
}
|
@@ -12027,13 +13130,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
12027
13130
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
12028
13131
|
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
|
12029
13132
|
}
|
12030
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
12031
|
-
new_type = GGML_TYPE_Q4_K;
|
12032
|
-
}
|
12033
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
12034
|
-
new_type = GGML_TYPE_Q4_K;
|
12035
|
-
}
|
12036
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
13133
|
+
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
|
12037
13134
|
new_type = GGML_TYPE_Q4_K;
|
12038
13135
|
}
|
12039
13136
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
@@ -12186,7 +13283,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
12186
13283
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
12187
13284
|
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
|
12188
13285
|
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
|
12189
|
-
new_type == GGML_TYPE_IQ3_XXS ||
|
13286
|
+
new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
|
13287
|
+
new_type == GGML_TYPE_IQ1_M) {
|
12190
13288
|
int nx = tensor->ne[0];
|
12191
13289
|
int ny = tensor->ne[1];
|
12192
13290
|
if (nx % QK_K != 0) {
|
@@ -12204,6 +13302,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
12204
13302
|
case GGML_TYPE_IQ3_XXS:
|
12205
13303
|
case GGML_TYPE_IQ3_S:
|
12206
13304
|
case GGML_TYPE_IQ1_S:
|
13305
|
+
case GGML_TYPE_IQ1_M:
|
12207
13306
|
case GGML_TYPE_Q2_K:
|
12208
13307
|
case GGML_TYPE_Q3_K:
|
12209
13308
|
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
|
@@ -12285,6 +13384,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12285
13384
|
case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
|
12286
13385
|
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
|
12287
13386
|
case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
|
13387
|
+
case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
|
12288
13388
|
case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
|
12289
13389
|
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
|
12290
13390
|
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
|
@@ -12307,8 +13407,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12307
13407
|
constexpr bool use_mmap = false;
|
12308
13408
|
#endif
|
12309
13409
|
|
12310
|
-
|
12311
|
-
|
13410
|
+
llama_model_kv_override * kv_overrides = nullptr;
|
13411
|
+
if (params->kv_overrides) {
|
13412
|
+
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
13413
|
+
kv_overrides = v->data();
|
13414
|
+
}
|
13415
|
+
llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
|
13416
|
+
ml.init_mappings(false); // no prefetching
|
12312
13417
|
|
12313
13418
|
llama_model model;
|
12314
13419
|
llm_load_arch(ml, model);
|
@@ -12332,36 +13437,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12332
13437
|
struct gguf_context * ctx_out = gguf_init_empty();
|
12333
13438
|
|
12334
13439
|
// copy the KV pairs from the input file
|
12335
|
-
gguf_set_kv (ctx_out, ml.
|
13440
|
+
gguf_set_kv (ctx_out, ml.meta);
|
12336
13441
|
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
12337
13442
|
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
12338
13443
|
|
13444
|
+
if (params->kv_overrides) {
|
13445
|
+
const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
|
13446
|
+
for (auto & o : overrides) {
|
13447
|
+
if (o.key[0] == 0) break;
|
13448
|
+
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
13449
|
+
gguf_set_val_f32(ctx_out, o.key, o.float_value);
|
13450
|
+
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
13451
|
+
gguf_set_val_i32(ctx_out, o.key, o.int_value);
|
13452
|
+
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
13453
|
+
gguf_set_val_bool(ctx_out, o.key, o.bool_value);
|
13454
|
+
} else {
|
13455
|
+
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
|
13456
|
+
}
|
13457
|
+
}
|
13458
|
+
}
|
13459
|
+
|
12339
13460
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
12340
|
-
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
13461
|
+
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
12341
13462
|
|
12342
13463
|
const std::string name = ggml_get_name(meta);
|
12343
13464
|
|
12344
13465
|
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
12345
13466
|
if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
|
12346
13467
|
++qs.n_attention_wv;
|
12347
|
-
}
|
12348
|
-
else if (name.find("ffn_down") != std::string::npos) {
|
12349
|
-
++qs.n_ffn_down;
|
12350
|
-
}
|
12351
|
-
else if (name.find("ffn_gate") != std::string::npos) {
|
12352
|
-
++qs.n_ffn_gate;
|
12353
|
-
}
|
12354
|
-
else if (name.find("ffn_up") != std::string::npos) {
|
12355
|
-
++qs.n_ffn_up;
|
12356
|
-
}
|
12357
|
-
else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
13468
|
+
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
12358
13469
|
qs.has_output = true;
|
12359
13470
|
}
|
12360
13471
|
}
|
12361-12364
|
-
… (content of removed lines 12361-12364 is not rendered in this diff view)
|
13472
|
+
|
13473
|
+
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
13474
|
+
|
13475
|
+
// sanity checks
|
13476
|
+
GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
|
12365
13477
|
|
12366
13478
|
size_t total_size_org = 0;
|
12367
13479
|
size_t total_size_new = 0;
|
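With the hunks above, quantization can now carry metadata overrides into the output GGUF: params->kv_overrides is cast back to a std::vector<llama_model_kv_override> and each entry is written with gguf_set_val_*. A hedged usage sketch; the declared type of the kv_overrides field and the size of the key buffer are not shown in this diff, so they are assumed here, and the override key is hypothetical.

#include <cstdio>
#include <vector>
#include "llama.h"

// Sketch: pass metadata overrides through to llama_model_quantize so they are
// written into the output GGUF by the gguf_set_val_* loop above.
static void quantize_with_override(const char * fin, const char * fout) {
    std::vector<llama_model_kv_override> overrides(2);
    std::snprintf(overrides[0].key, sizeof(overrides[0].key), "%s", "some.custom.key"); // hypothetical key
    overrides[0].tag       = LLAMA_KV_OVERRIDE_TYPE_INT;
    overrides[0].int_value = 42;
    overrides[1].key[0]    = 0; // the loop above stops at the first empty key

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    params.kv_overrides = &overrides; // read back as a std::vector pointer, as in the hunk above

    llama_model_quantize(fin, fout, &params);
}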
@@ -12377,7 +13489,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12377
13489
|
|
12378
13490
|
// populate the original tensors so we get an initial meta data
|
12379
13491
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
12380
|
-
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
13492
|
+
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
12381
13493
|
gguf_add_tensor(ctx_out, meta);
|
12382
13494
|
}
|
12383
13495
|
|
@@ -12391,6 +13503,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12391
13503
|
// placeholder for the meta data
|
12392
13504
|
::zeros(fout, meta_size);
|
12393
13505
|
|
13506
|
+
const auto tn = LLM_TN(model.arch);
|
13507
|
+
|
12394
13508
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
12395
13509
|
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
|
12396
13510
|
|
@@ -12413,8 +13527,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12413
13527
|
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
12414
13528
|
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
|
12415
13529
|
|
12416
|
-
// quantize only 2D tensors
|
12417
|
-
quantize &= (ggml_n_dims(tensor)
|
13530
|
+
// quantize only 2D and 3D tensors (experts)
|
13531
|
+
quantize &= (ggml_n_dims(tensor) >= 2);
|
12418
13532
|
quantize &= params->quantize_output_tensor || name != "output.weight";
|
12419
13533
|
quantize &= !params->only_copy;
|
12420
13534
|
|
@@ -12443,6 +13557,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12443
13557
|
if (!params->pure && ggml_is_quantized(default_type)) {
|
12444
13558
|
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
|
12445
13559
|
}
|
13560
|
+
else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
|
13561
|
+
new_type = params->token_embedding_type;
|
13562
|
+
}
|
13563
|
+
else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
|
13564
|
+
new_type = params->output_tensor_type;
|
13565
|
+
}
|
12446
13566
|
|
12447
13567
|
// If we've decided to quantize to the same type the tensor is already
|
12448
13568
|
// in then there's nothing to do.
|
@@ -12463,11 +13583,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12463
13583
|
if (it == imatrix_data->end()) {
|
12464
13584
|
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
12465
13585
|
} else {
|
12466
|
-
if (it->second.size() == (size_t)tensor->ne[0]) {
|
13586
|
+
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
|
12467
13587
|
imatrix = it->second.data();
|
12468
13588
|
} else {
|
12469
13589
|
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
|
12470
|
-
int(it->second.size()), int(tensor->ne[0]), tensor->name);
|
13590
|
+
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
|
13591
|
+
|
13592
|
+
// this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
|
13593
|
+
// this is a significant error and it may be good idea to abort the process if this happens,
|
13594
|
+
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
|
13595
|
+
// tok_embd should be ignored in this case, since it always causes this warning
|
13596
|
+
if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
|
13597
|
+
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
|
13598
|
+
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
|
13599
|
+
}
|
12471
13600
|
}
|
12472
13601
|
}
|
12473
13602
|
}
|
@@ -12475,6 +13604,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12475
13604
|
new_type == GGML_TYPE_IQ2_XS ||
|
12476
13605
|
new_type == GGML_TYPE_IQ2_S ||
|
12477
13606
|
new_type == GGML_TYPE_IQ1_S ||
|
13607
|
+
(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
|
12478
13608
|
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
12479
13609
|
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
12480
13610
|
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
@@ -12503,15 +13633,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
12503
13633
|
new_data = work.data();
|
12504
13634
|
|
12505
13635
|
const int n_per_row = tensor->ne[0];
|
12506
|
-
const int nrows =
|
13636
|
+
const int nrows = tensor->ne[1];
|
12507
13637
|
|
12508
13638
|
static const int min_chunk_size = 32 * 512;
|
12509
13639
|
const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
|
12510
13640
|
|
12511
|
-
const int
|
13641
|
+
const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
13642
|
+
const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
12512
13643
|
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
12513
|
-
new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
|
12514
13644
|
|
13645
|
+
// quantize each expert separately since they have different importance matrices
|
13646
|
+
new_size = 0;
|
13647
|
+
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
13648
|
+
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
13649
|
+
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
13650
|
+
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
13651
|
+
|
13652
|
+
new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
13653
|
+
}
|
12515
13654
|
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
12516
13655
|
}
|
12517
13656
|
total_size_org += ggml_nbytes(tensor);
|
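The per-expert loop above treats ne[2] as the expert dimension and quantizes each ne[0] x ne[1] slice with its own slice of the importance matrix. A small sketch of the offset arithmetic it relies on; the names below are illustrative.

#include <cstdint>

// Sketch: for a 3D weight of shape [n_per_row, nrows, n_expert], expert i starts at
// element i*n_per_row*nrows in the f32 source, at i*row_size_bytes*nrows in the
// quantized destination, and its importance matrix occupies n_per_row floats
// starting at element i*n_per_row.
struct expert_slice {
    int64_t src_elem;     // offset into the f32 data, in elements
    int64_t dst_byte;     // offset into the quantized data, in bytes
    int64_t imatrix_elem; // offset into the importance matrix, in elements
};

static expert_slice slice_offsets(int64_t i, int64_t n_per_row, int64_t nrows, int64_t row_size_bytes) {
    return { i * n_per_row * nrows, i * row_size_bytes * nrows, i * n_per_row };
}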
@@ -12582,7 +13721,7 @@ static int llama_apply_lora_from_file_internal(
|
|
12582
13721
|
if (path_base_model) {
|
12583
13722
|
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
12584
13723
|
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
12585
|
-
ml->
|
13724
|
+
ml->init_mappings(/*prefetch*/ false); // no prefetching
|
12586
13725
|
}
|
12587
13726
|
|
12588
13727
|
struct tensor_meta {
|
@@ -12703,7 +13842,7 @@ static int llama_apply_lora_from_file_internal(
|
|
12703
13842
|
|
12704
13843
|
ggml_tensor * base_t;
|
12705
13844
|
if (ml) {
|
12706
|
-
if (
|
13845
|
+
if (!ml->get_tensor_meta(base_name.c_str())) {
|
12707
13846
|
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
12708
13847
|
return 1;
|
12709
13848
|
}
|
@@ -12887,11 +14026,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
12887
14026
|
struct llama_model_quantize_params result = {
|
12888
14027
|
/*.nthread =*/ 0,
|
12889
14028
|
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
14029
|
+
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
|
14030
|
+
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
|
12890
14031
|
/*.allow_requantize =*/ false,
|
12891
14032
|
/*.quantize_output_tensor =*/ true,
|
12892
14033
|
/*.only_copy =*/ false,
|
12893
14034
|
/*.pure =*/ false,
|
12894
14035
|
/*.imatrix =*/ nullptr,
|
14036
|
+
/*.kv_overrides =*/ nullptr,
|
12895
14037
|
};
|
12896
14038
|
|
12897
14039
|
return result;
|
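The two new fields above default to GGML_TYPE_COUNT, which means "no override"; when set, they short-circuit the type heuristics in llama_tensor_get_type for output.weight and token_embd.weight. A minimal usage sketch, assuming an f16 input model; the file names are placeholders.

#include "llama.h"

// Sketch: force specific types for the output and token-embedding tensors while
// quantizing everything else with the Q4_K_M mix.
int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    params.output_tensor_type   = GGML_TYPE_Q6_K; // overrides the heuristic for output.weight
    params.token_embedding_type = GGML_TYPE_Q4_K; // overrides the heuristic for token_embd.weight
    params.nthread              = 8;

    return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params) == 0 ? 0 : 1;
}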
@@ -12900,7 +14042,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
12900
14042
|
size_t llama_max_devices(void) {
|
12901
14043
|
#if defined(GGML_USE_METAL)
|
12902
14044
|
return 1;
|
12903
|
-
#elif defined(
|
14045
|
+
#elif defined(GGML_USE_CUDA)
|
12904
14046
|
return GGML_CUDA_MAX_DEVICES;
|
12905
14047
|
#elif defined(GGML_USE_SYCL)
|
12906
14048
|
return GGML_SYCL_MAX_DEVICES;
|
@@ -12920,8 +14062,8 @@ bool llama_supports_mlock(void) {
|
|
12920
14062
|
}
|
12921
14063
|
|
12922
14064
|
bool llama_supports_gpu_offload(void) {
|
12923
|
-
#if defined(
|
12924
|
-
defined(GGML_USE_SYCL)
|
14065
|
+
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
14066
|
+
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
12925
14067
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
12926
14068
|
return true;
|
12927
14069
|
#else
|
@@ -13028,7 +14170,7 @@ struct llama_context * llama_new_context_with_model(
|
|
13028
14170
|
const auto & hparams = model->hparams;
|
13029
14171
|
auto & cparams = ctx->cparams;
|
13030
14172
|
|
13031
|
-
|
14173
|
+
cparams.n_seq_max = std::max(1u, params.n_seq_max);
|
13032
14174
|
cparams.n_threads = params.n_threads;
|
13033
14175
|
cparams.n_threads_batch = params.n_threads_batch;
|
13034
14176
|
cparams.yarn_ext_factor = params.yarn_ext_factor;
|
@@ -13126,7 +14268,7 @@ struct llama_context * llama_new_context_with_model(
|
|
13126
14268
|
}
|
13127
14269
|
ctx->backends.push_back(ctx->backend_metal);
|
13128
14270
|
}
|
13129
|
-
#elif defined(
|
14271
|
+
#elif defined(GGML_USE_CUDA)
|
13130
14272
|
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
13131
14273
|
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
13132
14274
|
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
@@ -13149,7 +14291,20 @@ struct llama_context * llama_new_context_with_model(
|
|
13149
14291
|
}
|
13150
14292
|
}
|
13151
14293
|
#elif defined(GGML_USE_VULKAN)
|
13152
|
-
if (model->
|
14294
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
14295
|
+
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
|
14296
|
+
llama_free(ctx);
|
14297
|
+
return nullptr;
|
14298
|
+
}
|
14299
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
14300
|
+
ggml_backend_t backend = ggml_backend_vk_init(0);
|
14301
|
+
if (backend == nullptr) {
|
14302
|
+
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
14303
|
+
llama_free(ctx);
|
14304
|
+
return nullptr;
|
14305
|
+
}
|
14306
|
+
ctx->backends.push_back(backend);
|
14307
|
+
} else {
|
13153
14308
|
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
13154
14309
|
ggml_backend_t backend = ggml_backend_vk_init(device);
|
13155
14310
|
if (backend == nullptr) {
|
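The Vulkan hunk above now fails context creation outright for LLAMA_SPLIT_MODE_ROW and uses device 0 for LLAMA_SPLIT_MODE_NONE. A sketch of how a caller would pick a supported mode through llama_model_params; the model path and layer count are placeholders.

#include "llama.h"

// Sketch: on the Vulkan backend, choose NONE (single device) or LAYER (one backend
// per device); ROW would make llama_new_context_with_model return nullptr.
static llama_model * load_on_vulkan(const char * path) {
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                     // offload everything that fits
    mparams.split_mode   = LLAMA_SPLIT_MODE_LAYER; // supported; ROW is rejected above
    mparams.main_gpu     = 0;
    return llama_load_model_from_file(path, mparams);
}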
@@ -13161,30 +14316,28 @@ struct llama_context * llama_new_context_with_model(
|
|
13161
14316
|
}
|
13162
14317
|
}
|
13163
14318
|
#elif defined(GGML_USE_SYCL)
|
13164-13167
|
-
… (content of removed lines 13164-13167 is not rendered in this diff view)
|
14319
|
+
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
14320
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
14321
|
+
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
|
14322
|
+
if (backend == nullptr) {
|
14323
|
+
int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
|
14324
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
|
14325
|
+
llama_free(ctx);
|
14326
|
+
return nullptr;
|
14327
|
+
}
|
14328
|
+
ctx->backends.push_back(backend);
|
14329
|
+
} else {
|
14330
|
+
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
14331
|
+
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
14332
|
+
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
13168
14333
|
if (backend == nullptr) {
|
13169
|
-
int
|
13170
|
-
|
14334
|
+
int id_list[GGML_SYCL_MAX_DEVICES];
|
14335
|
+
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
14336
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
|
13171
14337
|
llama_free(ctx);
|
13172
14338
|
return nullptr;
|
13173
14339
|
}
|
13174
14340
|
ctx->backends.push_back(backend);
|
13175
|
-
} else {
|
13176
|
-
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
13177
|
-
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
13178
|
-
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
13179
|
-
if (backend == nullptr) {
|
13180
|
-
int id_list[GGML_SYCL_MAX_DEVICES];
|
13181
|
-
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
13182
|
-
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
|
13183
|
-
llama_free(ctx);
|
13184
|
-
return nullptr;
|
13185
|
-
}
|
13186
|
-
ctx->backends.push_back(backend);
|
13187
|
-
}
|
13188
14341
|
}
|
13189
14342
|
}
|
13190
14343
|
#elif defined(GGML_USE_KOMPUTE)
|
@@ -13232,25 +14385,12 @@ struct llama_context * llama_new_context_with_model(
|
|
13232
14385
|
|
13233
14386
|
// graph outputs buffer
|
13234
14387
|
{
|
13235
|
-
// resized during inference
|
13236
|
-
ctx
|
13237
|
-
|
13238
|
-
|
13239
|
-
const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
|
13240
|
-
|
13241
|
-
ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
|
13242
|
-
if (ctx->buf_output == nullptr) {
|
13243
|
-
LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
|
14388
|
+
// resized during inference when a batch uses more outputs
|
14389
|
+
if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
|
14390
|
+
LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
|
13244
14391
|
llama_free(ctx);
|
13245
14392
|
return nullptr;
|
13246
14393
|
}
|
13247
|
-
ggml_backend_buffer_clear(ctx->buf_output, 0);
|
13248
|
-
|
13249
|
-
|
13250
|
-
ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
|
13251
|
-
if (params.embeddings) {
|
13252
|
-
ctx->embd = ctx->logits + ctx->logits_size;
|
13253
|
-
}
|
13254
14394
|
|
13255
14395
|
LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
|
13256
14396
|
ggml_backend_buffer_name(ctx->buf_output),
|
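The output buffer is now reserved through llama_output_reserve with room for one output per sequence and grown on demand, rather than being sized up front from logits_size + embd_size. A sketch of the context parameters that feed that reservation; the values are placeholders.

#include "llama.h"

// Sketch: n_seq_max sets the initial output reservation, n_batch bounds how far the
// buffer can grow when a batch flags more positions for output.
static llama_context * make_ctx(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx     = 4096;
    cparams.n_batch   = 512; // upper bound used when the output buffer has to grow
    cparams.n_seq_max = 4;   // initial reservation: 4 output rows
    return llama_new_context_with_model(model, cparams);
}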
@@ -13275,7 +14415,7 @@ struct llama_context * llama_new_context_with_model(
|
|
13275
14415
|
|
13276
14416
|
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
13277
14417
|
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
|
13278
|
-
#ifndef
|
14418
|
+
#ifndef GGML_USE_CUDA
|
13279
14419
|
// pipeline parallelism requires support for async compute and events
|
13280
14420
|
// currently this is only implemented in the CUDA backend
|
13281
14421
|
pipeline_parallel = false;
|
@@ -13383,11 +14523,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
13383
14523
|
case LLM_ARCH_ORION:
|
13384
14524
|
case LLM_ARCH_INTERNLM2:
|
13385
14525
|
case LLM_ARCH_MINICPM:
|
14526
|
+
case LLM_ARCH_XVERSE:
|
13386
14527
|
case LLM_ARCH_COMMAND_R:
|
13387
14528
|
return LLAMA_ROPE_TYPE_NORM;
|
13388
14529
|
|
13389
14530
|
// the pairs of head values are offset by n_rot/2
|
13390
14531
|
case LLM_ARCH_FALCON:
|
14532
|
+
case LLM_ARCH_GROK:
|
13391
14533
|
case LLM_ARCH_PERSIMMON:
|
13392
14534
|
case LLM_ARCH_BERT:
|
13393
14535
|
case LLM_ARCH_NOMIC_BERT:
|
@@ -13766,27 +14908,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
|
|
13766
14908
|
|
13767
14909
|
// Returns the *maximum* size of the state
|
13768
14910
|
size_t llama_get_state_size(const struct llama_context * ctx) {
|
14911
|
+
const auto & cparams = ctx->cparams;
|
14912
|
+
const auto & hparams = ctx->model.hparams;
|
14913
|
+
|
13769
14914
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
13770
14915
|
// for reference, std::mt19937(1337) serializes to 6701 bytes.
|
13771
14916
|
const size_t s_rng_size = sizeof(size_t);
|
13772
14917
|
const size_t s_rng = LLAMA_MAX_RNG_STATE;
|
14918
|
+
const size_t s_n_outputs = sizeof(size_t);
|
14919
|
+
// assume worst case for outputs although only currently set ones are serialized
|
14920
|
+
const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t);
|
13773
14921
|
const size_t s_logits_size = sizeof(size_t);
|
13774
|
-
|
13775
|
-
const size_t s_logits = ctx->logits_size * sizeof(float);
|
14922
|
+
const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
|
13776
14923
|
const size_t s_embedding_size = sizeof(size_t);
|
13777
|
-
const size_t s_embedding = ctx->embd_size * sizeof(float);
|
14924
|
+
const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0;
|
13778
14925
|
const size_t s_kv_buf_size = sizeof(size_t);
|
13779
14926
|
const size_t s_kv_head = sizeof(uint32_t);
|
13780
14927
|
const size_t s_kv_size = sizeof(uint32_t);
|
13781
14928
|
const size_t s_kv_used = sizeof(uint32_t);
|
13782
14929
|
const size_t s_kv = ctx->kv_self.total_size();
|
13783
|
-
|
13784
|
-
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
|
14930
|
+
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
|
13785
14931
|
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
|
13786
14932
|
|
13787
14933
|
const size_t s_total = (
|
13788
14934
|
+ s_rng_size
|
13789
14935
|
+ s_rng
|
14936
|
+
+ s_n_outputs
|
14937
|
+
+ s_output_pos
|
13790
14938
|
+ s_logits_size
|
13791
14939
|
+ s_logits
|
13792
14940
|
+ s_embedding_size
|
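llama_get_state_size now accounts for the serialized output ids and sizes the logits/embeddings sections as worst-case n_batch rows, so it remains an upper bound on what llama_copy_state_data actually writes. A minimal save/restore sketch using the three state functions touched in this diff:

#include <cstdint>
#include <vector>
#include "llama.h"

// Sketch: save a context into a byte buffer and restore it later. The buffer is
// trimmed to the number of bytes actually written.
static std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));
    const size_t written = llama_copy_state_data(ctx, buf.data());
    buf.resize(written);
    return buf;
}

static void load_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}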
@@ -13861,7 +15009,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13861
15009
|
std::ostringstream rng_ss;
|
13862
15010
|
rng_ss << ctx->rng;
|
13863
15011
|
|
13864
|
-
const std::string & rng_str
|
15012
|
+
const std::string & rng_str = rng_ss.str();
|
13865
15013
|
const size_t rng_size = rng_str.size();
|
13866
15014
|
|
13867
15015
|
GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
|
@@ -13870,25 +15018,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13870
15018
|
data_ctx->write(rng_str.data(), rng_size);
|
13871
15019
|
}
|
13872
15020
|
|
13873
|
-
// copy
|
15021
|
+
// copy outputs
|
13874
15022
|
{
|
13875
|
-
|
15023
|
+
// Can't use ctx->n_outputs because it's not for the
|
15024
|
+
// entire last batch when n_ubatch is smaller than n_batch
|
15025
|
+
size_t n_outputs = 0;
|
13876
15026
|
|
13877
|
-
|
15027
|
+
// copy output ids
|
15028
|
+
{
|
15029
|
+
std::vector<int32_t> output_pos;
|
13878
15030
|
|
13879
|
-
|
13880
|
-
|
15031
|
+
const size_t n_batch = ctx->cparams.n_batch;
|
15032
|
+
const auto & output_ids = ctx->output_ids;
|
15033
|
+
|
15034
|
+
output_pos.resize(ctx->output_size);
|
15035
|
+
|
15036
|
+
// build a more compact representation of the output ids
|
15037
|
+
for (size_t i = 0; i < n_batch; ++i) {
|
15038
|
+
// map an output id to a position in the batch
|
15039
|
+
int32_t pos = output_ids[i];
|
15040
|
+
if (pos >= 0) {
|
15041
|
+
if ((size_t) pos >= n_outputs) {
|
15042
|
+
n_outputs = pos + 1;
|
15043
|
+
}
|
15044
|
+
GGML_ASSERT((size_t) pos < ctx->output_size);
|
15045
|
+
output_pos[pos] = i;
|
15046
|
+
}
|
15047
|
+
}
|
15048
|
+
|
15049
|
+
data_ctx->write(&n_outputs, sizeof(n_outputs));
|
15050
|
+
|
15051
|
+
if (n_outputs) {
|
15052
|
+
data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
|
15053
|
+
}
|
13881
15054
|
}
|
13882
|
-
}
|
13883
15055
|
|
13884
|
-
|
13885
|
-
|
13886
|
-
|
15056
|
+
// copy logits
|
15057
|
+
{
|
15058
|
+
const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);
|
13887
15059
|
|
13888
|
-
|
15060
|
+
data_ctx->write(&logits_size, sizeof(logits_size));
|
13889
15061
|
|
13890
|
-
|
13891
|
-
|
15062
|
+
if (logits_size) {
|
15063
|
+
data_ctx->write(ctx->logits, logits_size * sizeof(float));
|
15064
|
+
}
|
15065
|
+
}
|
15066
|
+
|
15067
|
+
// copy embeddings
|
15068
|
+
{
|
15069
|
+
const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
|
15070
|
+
|
15071
|
+
data_ctx->write(&embeddings_size, sizeof(embeddings_size));
|
15072
|
+
|
15073
|
+
if (embeddings_size) {
|
15074
|
+
data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
|
15075
|
+
}
|
13892
15076
|
}
|
13893
15077
|
}
|
13894
15078
|
|
@@ -13901,9 +15085,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13901
15085
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
13902
15086
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
13903
15087
|
|
13904
|
-
|
15088
|
+
// NOTE: kv_size and kv_buf_size are mostly used for sanity checks
|
13905
15089
|
const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
|
13906
15090
|
const uint32_t kv_size = kv_self.size;
|
15091
|
+
const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
|
13907
15092
|
const uint32_t kv_used = kv_self.used;
|
13908
15093
|
|
13909
15094
|
data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
|
@@ -13912,6 +15097,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13912
15097
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
13913
15098
|
|
13914
15099
|
if (kv_buf_size) {
|
15100
|
+
const size_t pre_kv_buf_size = data_ctx->get_size_written();
|
15101
|
+
|
13915
15102
|
std::vector<uint8_t> tmp_buf;
|
13916
15103
|
for (int il = 0; il < (int) n_layer; ++il) {
|
13917
15104
|
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
@@ -13941,6 +15128,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
13941
15128
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
13942
15129
|
}
|
13943
15130
|
}
|
15131
|
+
GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
|
13944
15132
|
}
|
13945
15133
|
|
13946
15134
|
for (uint32_t i = 0; i < kv_head; ++i) {
|
@@ -13985,6 +15173,28 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
13985
15173
|
GGML_ASSERT(!rng_ss.fail());
|
13986
15174
|
}
|
13987
15175
|
|
15176
|
+
// set output ids
|
15177
|
+
{
|
15178
|
+
size_t n_outputs;
|
15179
|
+
std::vector<int32_t> output_pos;
|
15180
|
+
|
15181
|
+
memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
|
15182
|
+
|
15183
|
+
GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
|
15184
|
+
|
15185
|
+
if (n_outputs) {
|
15186
|
+
output_pos.resize(n_outputs);
|
15187
|
+
memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
|
15188
|
+
inp += n_outputs * sizeof(int32_t);
|
15189
|
+
|
15190
|
+
for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
|
15191
|
+
int32_t id = output_pos[i];
|
15192
|
+
GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
|
15193
|
+
ctx->output_ids[id] = i;
|
15194
|
+
}
|
15195
|
+
}
|
15196
|
+
}
|
15197
|
+
|
13988
15198
|
// set logits
|
13989
15199
|
{
|
13990
15200
|
size_t logits_size;
|
@@ -14005,7 +15215,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
14005
15215
|
|
14006
15216
|
memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
|
14007
15217
|
|
14008
|
-
GGML_ASSERT(ctx->embd_size
|
15218
|
+
GGML_ASSERT(ctx->embd_size >= embeddings_size);
|
14009
15219
|
|
14010
15220
|
if (embeddings_size) {
|
14011
15221
|
memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
|
@@ -14032,8 +15242,18 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
14032
15242
|
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
14033
15243
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
14034
15244
|
|
15245
|
+
if (kv_self.size != kv_size) {
|
15246
|
+
// the KV cache needs to be big enough to load all the KV cells from the saved state
|
15247
|
+
GGML_ASSERT(kv_self.size >= kv_head);
|
15248
|
+
|
15249
|
+
LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
|
15250
|
+
__func__, kv_head, kv_size, kv_self.size);
|
15251
|
+
}
|
15252
|
+
|
14035
15253
|
if (kv_buf_size) {
|
14036
|
-
|
15254
|
+
const size_t pre_kv_buf_size = inp - src;
|
15255
|
+
|
15256
|
+
GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
|
14037
15257
|
|
14038
15258
|
for (int il = 0; il < (int) n_layer; ++il) {
|
14039
15259
|
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
@@ -14053,23 +15273,21 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
14053
15273
|
|
14054
15274
|
// v is not contiguous, copy row by row
|
14055
15275
|
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
14056
|
-
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type,
|
15276
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);
|
14057
15277
|
|
14058
15278
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
14059
15279
|
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
|
14060
15280
|
inp += v_row_size;
|
14061
15281
|
}
|
14062
15282
|
}
|
15283
|
+
GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
|
14063
15284
|
}
|
14064
15285
|
|
14065
|
-
|
15286
|
+
llama_kv_cache_clear(ctx);
|
14066
15287
|
|
14067
15288
|
ctx->kv_self.head = kv_head;
|
14068
|
-
ctx->kv_self.size = kv_size;
|
14069
15289
|
ctx->kv_self.used = kv_used;
|
14070
15290
|
|
14071
|
-
ctx->kv_self.cells.resize(kv_size);
|
14072
|
-
|
14073
15291
|
for (uint32_t i = 0; i < kv_head; ++i) {
|
14074
15292
|
llama_pos pos;
|
14075
15293
|
size_t seq_id_size;
|
@@ -14086,11 +15304,6 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
14086
15304
|
ctx->kv_self.cells[i].seq_id.insert(seq_id);
|
14087
15305
|
}
|
14088
15306
|
}
|
14089
|
-
|
14090
|
-
for (uint32_t i = kv_head; i < kv_size; ++i) {
|
14091
|
-
ctx->kv_self.cells[i].pos = -1;
|
14092
|
-
ctx->kv_self.cells[i].seq_id.clear();
|
14093
|
-
}
|
14094
15307
|
}
|
14095
15308
|
|
14096
15309
|
const size_t nread = inp - src;
|
@@ -14296,11 +15509,33 @@ float * llama_get_logits(struct llama_context * ctx) {
|
|
14296
15509
|
}
|
14297
15510
|
|
14298
15511
|
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
14299
|
-
assert(ctx->logits_valid.at(i));
|
14300
|
-
|
14301
15512
|
llama_synchronize(ctx);
|
14302
15513
|
|
14303
|
-
|
15514
|
+
try {
|
15515
|
+
if (ctx->logits == nullptr) {
|
15516
|
+
throw std::runtime_error("no logits");
|
15517
|
+
}
|
15518
|
+
if ((size_t) i >= ctx->output_ids.size()) {
|
15519
|
+
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
|
15520
|
+
}
|
15521
|
+
const int32_t j = ctx->output_ids[i];
|
15522
|
+
|
15523
|
+
if (j < 0) {
|
15524
|
+
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
15525
|
+
}
|
15526
|
+
if ((size_t) j >= ctx->output_size) {
|
15527
|
+
// This should not happen
|
15528
|
+
throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
|
15529
|
+
}
|
15530
|
+
|
15531
|
+
return ctx->logits + j*ctx->model.hparams.n_vocab;
|
15532
|
+
} catch (const std::exception & err) {
|
15533
|
+
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
|
15534
|
+
#ifndef NDEBUG
|
15535
|
+
GGML_ASSERT(false);
|
15536
|
+
#endif
|
15537
|
+
return nullptr;
|
15538
|
+
}
|
14304
15539
|
}
|
14305
15540
|
|
14306
15541
|
float * llama_get_embeddings(struct llama_context * ctx) {
|
@@ -14312,7 +15547,31 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
|
14312
15547
|
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
14313
15548
|
llama_synchronize(ctx);
|
14314
15549
|
|
14315
|
-
|
15550
|
+
try {
|
15551
|
+
if (ctx->embd == nullptr) {
|
15552
|
+
throw std::runtime_error("no embeddings");
|
15553
|
+
}
|
15554
|
+
if ((size_t) i >= ctx->output_ids.size()) {
|
15555
|
+
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
|
15556
|
+
}
|
15557
|
+
const int32_t j = ctx->output_ids[i];
|
15558
|
+
|
15559
|
+
if (j < 0) {
|
15560
|
+
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
15561
|
+
}
|
15562
|
+
if ((size_t) j >= ctx->output_size) {
|
15563
|
+
// This should not happen
|
15564
|
+
throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
|
15565
|
+
}
|
15566
|
+
|
15567
|
+
return ctx->embd + j*ctx->model.hparams.n_embd;
|
15568
|
+
} catch (const std::exception & err) {
|
15569
|
+
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
|
15570
|
+
#ifndef NDEBUG
|
15571
|
+
GGML_ASSERT(false);
|
15572
|
+
#endif
|
15573
|
+
return nullptr;
|
15574
|
+
}
|
14316
15575
|
}
|
14317
15576
|
|
14318
15577
|
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
|
@@ -14602,6 +15861,55 @@ static int32_t llama_chat_apply_template_internal(
|
|
14602
15861
|
ss << message->content << "</s>";
|
14603
15862
|
}
|
14604
15863
|
}
|
15864
|
+
} else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
|
15865
|
+
// openchat/openchat-3.5-0106,
|
15866
|
+
for (auto message : chat) {
|
15867
|
+
std::string role(message->role);
|
15868
|
+
if (role == "system") {
|
15869
|
+
ss << message->content << "<|end_of_turn|>";
|
15870
|
+
} else {
|
15871
|
+
role[0] = toupper(role[0]);
|
15872
|
+
ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
|
15873
|
+
}
|
15874
|
+
}
|
15875
|
+
if (add_ass) {
|
15876
|
+
ss << "GPT4 Correct Assistant:";
|
15877
|
+
}
|
15878
|
+
} else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
|
15879
|
+
// eachadea/vicuna-13b-1.1 (and Orca variant)
|
15880
|
+
for (auto message : chat) {
|
15881
|
+
std::string role(message->role);
|
15882
|
+
if (role == "system") {
|
15883
|
+
// Orca-Vicuna variant uses a system prefix
|
15884
|
+
if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
|
15885
|
+
ss << "SYSTEM: " << message->content << "\n";
|
15886
|
+
} else {
|
15887
|
+
ss << message->content << "\n\n";
|
15888
|
+
}
|
15889
|
+
} else if (role == "user") {
|
15890
|
+
ss << "USER: " << message->content << "\n";
|
15891
|
+
} else if (role == "assistant") {
|
15892
|
+
ss << "ASSISTANT: " << message->content << "</s>\n";
|
15893
|
+
}
|
15894
|
+
}
|
15895
|
+
if (add_ass) {
|
15896
|
+
ss << "ASSISTANT:";
|
15897
|
+
}
|
15898
|
+
} else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
|
15899
|
+
// deepseek-ai/deepseek-coder-33b-instruct
|
15900
|
+
for (auto message : chat) {
|
15901
|
+
std::string role(message->role);
|
15902
|
+
if (role == "system") {
|
15903
|
+
ss << message->content;
|
15904
|
+
} else if (role == "user") {
|
15905
|
+
ss << "### Instruction:\n" << message->content << "\n";
|
15906
|
+
} else if (role == "assistant") {
|
15907
|
+
ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
|
15908
|
+
}
|
15909
|
+
}
|
15910
|
+
if (add_ass) {
|
15911
|
+
ss << "### Response:\n";
|
15912
|
+
}
|
14605
15913
|
} else {
|
14606
15914
|
// template not supported
|
14607
15915
|
return -1;
|
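The block above adds detection for the openchat, vicuna/vicuna-orca and deepseek-coder formats to llama_chat_apply_template. A hedged usage sketch; the full parameter list of llama_chat_apply_template is not shown in this hunk, so the argument order below follows the llama.h declaration as understood here, and the messages are placeholders.

#include <string>
#include <vector>
#include "llama.h"

// Sketch: render a conversation with one of the newly recognized templates. Passing
// "openchat", "vicuna", "vicuna-orca" or "deepseek" as the template name (instead of
// nullptr, which uses the model's own template) selects the formats added above.
static std::string render_openchat(const llama_model * model) {
    std::vector<llama_chat_message> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };
    std::string buf(4096, '\0');
    const int32_t n = llama_chat_apply_template(model, "openchat", chat.data(), chat.size(),
                                                /*add_ass*/ true, &buf[0], (int32_t) buf.size());
    if (n < 0 || (size_t) n > buf.size()) {
        return ""; // unsupported template or buffer too small (ignored in this sketch)
    }
    buf.resize(n);
    return buf;
}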
@@ -14651,6 +15959,30 @@ LLAMA_API int32_t llama_chat_apply_template(
|
|
14651
15959
|
return res;
|
14652
15960
|
}
|
14653
15961
|
|
15962
|
+
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
|
15963
|
+
static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
|
15964
|
+
if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
|
15965
|
+
return strlen(split_path);
|
15966
|
+
}
|
15967
|
+
return 0;
|
15968
|
+
}
|
15969
|
+
|
15970
|
+
int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
|
15971
|
+
std::string str_split_path(split_path);
|
15972
|
+
char postfix[32];
|
15973
|
+
snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
|
15974
|
+
std::string str_postfix(postfix);
|
15975
|
+
|
15976
|
+
// check if dest ends with postfix
|
15977
|
+
int size_prefix = str_split_path.size() - str_postfix.size();
|
15978
|
+
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
|
15979
|
+
snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
|
15980
|
+
return size_prefix;
|
15981
|
+
}
|
15982
|
+
|
15983
|
+
return 0;
|
15984
|
+
}
|
15985
|
+
|
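The new llama_split_path and llama_split_prefix helpers above generate and parse the "-%05d-of-%05d.gguf" naming scheme for split model files. A small usage sketch; the prefix string is a placeholder.

#include <cstdio>
#include "llama.h"

// Sketch: build the first split's file name and recover the prefix from it.
int main() {
    char path[512];
    llama_split_path(path, sizeof(path), "grok-1-q4_0", /*split_no*/ 0, /*split_count*/ 8);
    printf("%s\n", path); // grok-1-q4_0-00001-of-00008.gguf

    char prefix[512];
    if (llama_split_prefix(prefix, sizeof(prefix), path, 0, 8) > 0) {
        printf("%s\n", prefix); // grok-1-q4_0
    }
    return 0;
}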
14654
15986
|
struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
14655
15987
|
struct llama_timings result = {
|
14656
15988
|
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
|