llama_cpp 0.12.2 → 0.12.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +68 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -2
- data/vendor/tmp/llama.cpp/Makefile +25 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +87 -27
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +176 -18
- data/vendor/tmp/llama.cpp/ggml-backend.h +14 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +144 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +736 -59
- data/vendor/tmp/llama.cpp/ggml-quants.h +20 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15255 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +60854 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5270 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +34 -0
- data/vendor/tmp/llama.cpp/ggml.c +664 -117
- data/vendor/tmp/llama.cpp/ggml.h +46 -11
- data/vendor/tmp/llama.cpp/llama.cpp +1426 -341
- data/vendor/tmp/llama.cpp/llama.h +24 -15
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +10 -3
@@ -11,6 +11,12 @@
 # include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 # include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+# include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+# include "ggml-sycl.h"
+#elif defined(GGML_USE_KOMPUTE)
+# include "ggml-kompute.h"
 #endif
 
 #ifdef GGML_USE_METAL
@@ -52,6 +58,7 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cfloat>
 #include <cinttypes>
 #include <climits>
 #include <cmath>
@@ -192,8 +199,12 @@ enum llm_arch {
 LLM_ARCH_BLOOM,
 LLM_ARCH_STABLELM,
 LLM_ARCH_QWEN,
+LLM_ARCH_QWEN2,
 LLM_ARCH_PHI2,
 LLM_ARCH_PLAMO,
+LLM_ARCH_CODESHELL,
+LLM_ARCH_ORION,
+LLM_ARCH_INTERNLM2,
 LLM_ARCH_UNKNOWN,
 };
 
@@ -211,8 +222,12 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
 { LLM_ARCH_BLOOM, "bloom" },
 { LLM_ARCH_STABLELM, "stablelm" },
 { LLM_ARCH_QWEN, "qwen" },
+{ LLM_ARCH_QWEN2, "qwen2" },
 { LLM_ARCH_PHI2, "phi2" },
 { LLM_ARCH_PLAMO, "plamo" },
+{ LLM_ARCH_CODESHELL, "codeshell" },
+{ LLM_ARCH_ORION, "orion" },
+{ LLM_ARCH_INTERNLM2, "internlm2" },
 };
 
 enum llm_kv {
@@ -265,6 +280,7 @@ enum llm_kv {
 LLM_KV_TOKENIZER_PAD_ID,
 LLM_KV_TOKENIZER_ADD_BOS,
 LLM_KV_TOKENIZER_ADD_EOS,
+LLM_KV_TOKENIZER_ADD_PREFIX,
 LLM_KV_TOKENIZER_HF_JSON,
 LLM_KV_TOKENIZER_RWKV,
 };
@@ -319,6 +335,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
 { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
 { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
 { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
+{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
 { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
 { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
 };
@@ -566,6 +583,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_QWEN2,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
 {
 LLM_ARCH_PHI2,
 {
@@ -600,7 +634,62 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
-
+{
+LLM_ARCH_CODESHELL,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
+{
+LLM_ARCH_ORION,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
+{
+LLM_ARCH_INTERNLM2,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
 {
 LLM_ARCH_UNKNOWN,
 {
@@ -1091,10 +1180,10 @@ struct llama_mlock {
 #ifdef __APPLE__
 #define MLOCK_SUGGESTION \
 "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-"decreasing 'vm.global_no_user_wire_amount'. Also try increasing
+"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
 #else
 #define MLOCK_SUGGESTION \
-"Try increasing
+"Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
 #endif
 
 bool raw_lock(const void * addr, size_t size) const {
@@ -1215,8 +1304,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
 if (host_buffer) {
 buft = ggml_backend_cuda_host_buffer_type();
 }
+#elif defined(GGML_USE_SYCL)
+buft = ggml_backend_sycl_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
 buft = ggml_backend_cpu_hbm_buffer_type();
+#elif defined(GGML_USE_VULKAN)
+if (host_buffer) {
+buft = ggml_backend_vk_host_buffer_type();
+}
 #endif
 
 if (buft == nullptr) {
@@ -1234,8 +1329,17 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUBLAS)
 buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+buft = ggml_backend_vk_buffer_type();
+#elif defined(GGML_USE_SYCL)
+buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
 buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+buft = ggml_backend_kompute_buffer_type(gpu);
+if (buft == nullptr) {
+LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+}
 #endif
 
 if (buft == nullptr) {
@@ -1284,12 +1388,16 @@ static llama_state g_state;
 // available llama models
 enum e_model {
 MODEL_UNKNOWN,
+MODEL_0_5B,
 MODEL_1B,
 MODEL_3B,
+MODEL_4B,
 MODEL_7B,
 MODEL_8B,
 MODEL_13B,
+MODEL_14B,
 MODEL_15B,
+MODEL_20B,
 MODEL_30B,
 MODEL_34B,
 MODEL_40B,
@@ -1393,6 +1501,9 @@ struct llama_cparams {
 
 bool mul_mat_q;
 bool offload_kqv;
+
+ggml_backend_sched_eval_callback cb_eval;
+void * cb_eval_user_data;
 };
 
 struct llama_layer {
@@ -1528,6 +1639,8 @@ struct llama_vocab {
 id special_suffix_id = 32008;
 id special_eot_id = 32010;
 
+bool add_space_prefix = true;
+
 int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
 GGML_ASSERT(token_left.find(' ') == std::string::npos);
 GGML_ASSERT(token_left.find('\n') == std::string::npos);
@@ -1596,7 +1709,7 @@ struct llama_model {
 std::unique_ptr<llama_mmap> mapping;
 
 // objects representing data potentially being locked in memory
-llama_mlock
+std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
 llama_mlock mlock_mmap;
 
 // for quantize-stats only
@@ -1623,6 +1736,9 @@ struct llama_context {
 for (ggml_backend_t backend : backends) {
 ggml_backend_free(backend);
 }
+
+ggml_backend_buffer_free(buf_input);
+ggml_free(ctx_input);
 }
 
 llama_cparams cparams;
@@ -1669,8 +1785,14 @@ struct llama_context {
 // allocator for the input tensors
 ggml_tallocr * alloc = nullptr;
 
-//
-
+// input tensors
+ggml_backend_buffer_t buf_input = nullptr;
+ggml_context * ctx_input = nullptr;
+struct ggml_tensor * inp_tokens; // I32 [n_batch]
+struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
+struct ggml_tensor * inp_pos; // I32 [n_batch]
+struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
+struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
 
 #ifdef GGML_USE_MPI
 ggml_mpi_context * ctx_mpi = NULL;
@@ -2254,20 +2376,21 @@ struct llama_model_loader {
 }
 
 switch (type_max) {
-case GGML_TYPE_F32:
-case GGML_TYPE_F16:
-case GGML_TYPE_Q4_0:
-case GGML_TYPE_Q4_1:
-case GGML_TYPE_Q5_0:
-case GGML_TYPE_Q5_1:
-case GGML_TYPE_Q8_0:
-case GGML_TYPE_Q2_K:
-case GGML_TYPE_Q3_K:
-case GGML_TYPE_Q4_K:
-case GGML_TYPE_Q5_K:
-case GGML_TYPE_Q6_K:
+case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
+case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
+case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
+case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
+case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
+case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
+case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
+case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
+case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
+case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
+case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
+case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
 case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
 case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
+case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
 default:
 {
 LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2613,8 +2736,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
 case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
 case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
-case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "
+case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
+case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
 
 default: return "unknown, may not work";
 }
@@ -2627,7 +2752,9 @@ static const char * llama_model_type_name(e_model type) {
 case MODEL_7B: return "7B";
 case MODEL_8B: return "8B";
 case MODEL_13B: return "13B";
+case MODEL_14B: return "14B";
 case MODEL_15B: return "15B";
+case MODEL_20B: return "20B";
 case MODEL_30B: return "30B";
 case MODEL_34B: return "34B";
 case MODEL_40B: return "40B";
@@ -2640,6 +2767,14 @@ static const char * llama_model_type_name(e_model type) {
 default: return "?B";
 }
 }
+static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
+switch (type) {
+case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+default: return "unknown";
+}
+}
+
 
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
 model.arch = ml.get_arch();
@@ -2830,6 +2965,7 @@ static void llm_load_hparams(
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
 switch (hparams.n_layer) {
+case 24: model.type = e_model::MODEL_1B; break;
 case 32: model.type = e_model::MODEL_3B; break;
 default: model.type = e_model::MODEL_UNKNOWN;
 }
@@ -2844,6 +2980,17 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_QWEN2:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+switch (hparams.n_layer) {
+case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
+case 32: model.type = e_model::MODEL_7B; break;
+case 40: model.type = hparams.n_head == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
+case 80: model.type = e_model::MODEL_70B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 case LLM_ARCH_PHI2:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2874,7 +3021,32 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_CODESHELL:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+switch (hparams.n_layer) {
+case 42: model.type = e_model::MODEL_SMALL; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
+case LLM_ARCH_ORION:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
+switch (hparams.n_layer) {
+case 40: model.type = e_model::MODEL_14B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
+case LLM_ARCH_INTERNLM2:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+switch (hparams.n_layer) {
+case 32: model.type = e_model::MODEL_7B; break;
+case 48: model.type = e_model::MODEL_20B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 default: (void)0;
 }
 
@@ -2926,6 +3098,11 @@ static void llm_load_vocab(
 vocab.special_unk_id = 0;
 vocab.special_sep_id = -1;
 vocab.special_pad_id = -1;
+
+const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+if (add_space_prefix_keyidx != -1) {
+vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+} // The default value of add_space_prefix is true.
 } else if (tokenizer_name == "gpt2") {
 vocab.type = LLAMA_VOCAB_TYPE_BPE;
 
@@ -3138,7 +3315,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 // hparams
 LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
 LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
-LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type
+LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
 LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
 LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
 LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -3435,7 +3612,12 @@ static bool llm_load_tensors(
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-
+if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+} else {
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
+ml.n_created--; // artificial tensor
+}
 }
 
 for (int i = 0; i < n_layer; ++i) {
@@ -3629,6 +3811,11 @@ static bool llm_load_tensors(
 layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
+// optional bias tensors, present in Stable LM 2 1.6B
+layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
+layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
+layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
+
 layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
 
@@ -3666,6 +3853,41 @@ static bool llm_load_tensors(
 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
 }
 } break;
+case LLM_ARCH_QWEN2:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+}
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+// optional bias tensors
+layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+}
+} break;
 case LLM_ARCH_PHI2:
 {
 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -3776,6 +3998,101 @@ static bool llm_load_tensors(
 layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
 }
 } break;
+case LLM_ARCH_CODESHELL:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+}
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+}
+} break;
+case LLM_ARCH_ORION:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+}
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+}
+} break;
+case LLM_ARCH_INTERNLM2:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+}
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+// layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+}
+} break;
 default:
 throw std::runtime_error("unknown architecture");
 }
@@ -3812,8 +4129,10 @@ static bool llm_load_tensors(
 else {
 buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
 if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
-model.
-model.
+model.mlock_bufs.emplace_back(new llama_mlock);
+auto & mlock_buf = model.mlock_bufs.back();
+mlock_buf->init (ggml_backend_buffer_get_base(buf));
+mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
 }
 }
 if (buf == nullptr) {
@@ -3870,7 +4189,7 @@ static bool llm_load_tensors(
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, llama_model & model,
+static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
 try {
 llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
@@ -3891,6 +4210,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
 return 0;
 }
 
+#ifdef GGML_USE_KOMPUTE
+if (params.n_gpu_layers > 0 && (
+!(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
+|| !(
+model.ftype == LLAMA_FTYPE_ALL_F32 ||
+model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+)
+)) {
+// TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
+LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
+params.n_gpu_layers = 0;
+}
+#endif
+
 if (!llm_load_tensors(
 ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
 params.progress_callback, params.progress_callback_user_data
@@ -3939,22 +4274,24 @@ static struct ggml_tensor * llm_build_inp_embd(
 const llama_hparams & hparams,
 const llama_batch & batch,
 struct ggml_tensor * tok_embd,
+struct ggml_tensor * inp_tokens,
+struct ggml_tensor * inp_embd,
 const llm_build_cb & cb) {
 const int64_t n_embd = hparams.n_embd;
 
 struct ggml_tensor * inpL;
 
 if (batch.token) {
-struct ggml_tensor *
+struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0);
 cb(inp_tokens, "inp_tokens", -1);
 
-inpL = ggml_get_rows(ctx, tok_embd,
+inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v);
 } else {
 #ifdef GGML_USE_MPI
 GGML_ASSERT(false && "not implemented");
 #endif
 
-inpL =
+inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0);
 }
 
 return inpL;
@@ -3968,6 +4305,7 @@ static void llm_build_k_shift(
 const llama_cparams & cparams,
 const llama_kv_cache & kv,
 struct ggml_cgraph * graph,
+struct ggml_tensor * K_shift,
 llm_rope_type type,
 int64_t n_ctx,
 float freq_base,
@@ -3984,9 +4322,6 @@ static void llm_build_k_shift(
 const float beta_fast = cparams.yarn_beta_fast;
 const float beta_slow = cparams.yarn_beta_slow;
 
-struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
-cb(K_shift, "K_shift", -1);
-
 int rope_type = 0;
 
 switch (type) {
@@ -4174,6 +4509,7 @@ static struct ggml_tensor * llm_build_kqv(
 const llama_model & model,
 const llama_hparams & hparams,
 const llama_kv_cache & kv,
+struct ggml_cgraph * graph,
 struct ggml_tensor * wo,
 struct ggml_tensor * wo_b,
 struct ggml_tensor * q_cur,
@@ -4252,6 +4588,8 @@ static struct ggml_tensor * llm_build_kqv(
 struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
 cb(cur, "kqv_merged_cont", il);
 
+ggml_build_forward_expand(graph, cur);
+
 cur = ggml_mul_mat(ctx, wo, cur);
 if (wo_b) {
 cb(cur, "kqv_wo", il);
@@ -4264,17 +4602,56 @@ static struct ggml_tensor * llm_build_kqv(
 return cur;
 }
 
-struct
-
-
-
-
-
-
-
-
-
-
+static struct ggml_tensor * llm_build_kv(
+struct ggml_context * ctx,
+const llama_model & model,
+const llama_hparams & hparams,
+const llama_kv_cache & kv,
+struct ggml_cgraph * graph,
+struct ggml_tensor * wo,
+struct ggml_tensor * wo_b,
+struct ggml_tensor * k_cur,
+struct ggml_tensor * v_cur,
+struct ggml_tensor * q_cur,
+struct ggml_tensor * kq_mask,
+int64_t n_ctx,
+int32_t n_tokens,
+int32_t kv_head,
+int32_t n_kv,
+float max_alibi_bias,
+float kq_scale,
+const llm_build_cb & cb,
+int il) {
+
+// these nodes are added to the graph together so that they are not reordered
+// by doing so, the number of splits in the graph is reduced
+ggml_build_forward_expand(graph, q_cur);
+ggml_build_forward_expand(graph, k_cur);
+ggml_build_forward_expand(graph, v_cur);
+
+llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
+
+struct ggml_tensor * cur;
+cur = llm_build_kqv(ctx, model, hparams, kv, graph,
+wo, wo_b,
+q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
+cb(cur, "kqv_out", il);
+
+return cur;
+}
+
+struct llm_build_context {
+const llama_model & model;
+const llama_context & lctx;
+const llama_hparams & hparams;
+const llama_cparams & cparams;
+const llama_batch & batch;
+const llama_kv_cache & kv_self;
+
+const int64_t n_embd;
+const int64_t n_layer;
+const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
+const int64_t n_head;
 const int64_t n_head_kv;
 const int64_t n_embd_head_k;
 const int64_t n_embd_k_gqa;
@@ -4312,6 +4689,7 @@ struct llm_build_context {
 const llm_build_cb & cb,
 bool worst_case) :
 model (lctx.model),
+lctx (lctx),
 hparams (model.hparams),
 cparams (lctx.cparams),
 batch (batch),
@@ -4372,20 +4750,20 @@ struct llm_build_context {
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 
-inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
 cb(inpL, "inp_embd", -1);
 
 // inp_pos - contains the positions
-struct ggml_tensor * inp_pos =
+struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
 cb(inp_pos, "inp_pos", -1);
 
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-struct ggml_tensor * KQ_mask =
+struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
 cb(KQ_mask, "KQ_mask", -1);
 
 // shift the entire K-cache if needed
 if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
 }
 
 for (int il = 0; il < n_layer; ++il) {
@@ -4421,12 +4799,6 @@ struct llm_build_context {
 cb(Vcur, "Vcur", il);
 }
 
-// these nodes are added to the graph together so that they are not reordered
-// by doing so, the number of splits in the graph is reduced
-ggml_build_forward_expand(gf, Qcur);
-ggml_build_forward_expand(gf, Kcur);
-ggml_build_forward_expand(gf, Vcur);
-
 Qcur = ggml_rope_custom(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
 hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
@@ -4441,11 +4813,9 @@ struct llm_build_context {
 );
 cb(Kcur, "Kcur", il);
 
-
-
-cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -4564,20 +4934,20 @@ struct llm_build_context {
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 
-inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
 cb(inpL, "inp_embd", -1);
 
 // inp_pos - contains the positions
-struct ggml_tensor * inp_pos =
+struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
 cb(inp_pos, "inp_pos", -1);
 
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-struct ggml_tensor * KQ_mask =
+struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
 cb(KQ_mask, "KQ_mask", -1);
 
 // shift the entire K-cache if needed
 if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
 }
 
 for (int il = 0; il < n_layer; ++il) {
@@ -4622,14 +4992,13 @@ struct llm_build_context {
 cb(Qcur, "Qcur", il);
 cb(Kcur, "Kcur", il);
 
-llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
 // apply ALiBi for 13B model
 const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
 
-cur =
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -4686,20 +5055,20 @@ struct llm_build_context {
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 
-inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
 cb(inpL, "inp_embd", -1);
 
 // inp_pos - contains the positions
-struct ggml_tensor * inp_pos =
+struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
 cb(inp_pos, "inp_pos", -1);
 
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-struct ggml_tensor * KQ_mask =
+struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
 cb(KQ_mask, "KQ_mask", -1);
 
 // shift the entire K-cache if needed
 if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
 }
 
 for (int il = 0; il < n_layer; ++il) {
@@ -4751,11 +5120,9 @@ struct llm_build_context {
 );
 cb(Kcur, "Kcur", il);
 
-
-
-cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -4810,15 +5177,15 @@ struct llm_build_context {
 struct ggml_tensor * pos;
 struct ggml_tensor * inpL;
 
-inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
 cb(inpL, "inp_embd", -1);
 
 // inp_pos - contains the positions
-struct ggml_tensor * inp_pos =
+struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
 cb(inp_pos, "inp_pos", -1);
 
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-struct ggml_tensor * KQ_mask =
+struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
 cb(KQ_mask, "KQ_mask", -1);
 
 pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
@@ -4852,11 +5219,9 @@ struct llm_build_context {
 
 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-
-
-cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -4909,19 +5274,19 @@ struct llm_build_context {
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 
-inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
 cb(inpL, "inp_embd", -1);
 
 // inp_pos - contains the positions
-struct ggml_tensor * inp_pos =
+struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
 cb(inp_pos, "inp_pos", -1);
 
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-struct ggml_tensor * KQ_mask =
+struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
 cb(KQ_mask, "KQ_mask", -1);
 
 if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
 }
 
 for (int il = 0; il < n_layer; ++il) {
@@ -5059,12 +5424,9 @@ struct llm_build_context {
 );
 cb(Vcur, "Vcur", il);
 
-
-
-// TODO: not tested, could be broken
-cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -5119,11 +5481,11 @@ struct llm_build_context {
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 
-inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
 cb(inpL, "inp_embd", -1);
 
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-struct ggml_tensor * KQ_mask =
+struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
 cb(KQ_mask, "KQ_mask", -1);
 
 for (int il = 0; il < n_layer; ++il) {
@@ -5151,11 +5513,9 @@ struct llm_build_context {
 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 cb(Qcur, "Qcur", il);
 
-
-
-cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -5211,11 +5571,11 @@ struct llm_build_context {
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 
-inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
 cb(inpL, "inp_embd", -1);
 
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-struct ggml_tensor * KQ_mask =
+struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
 cb(KQ_mask, "KQ_mask", -1);
 
 inpL = llm_build_norm(ctx0, inpL, hparams,
@@ -5249,11 +5609,9 @@ struct llm_build_context {
 
 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-
-
-cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -5306,11 +5664,11 @@ struct llm_build_context {
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 
-inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
 cb(inpL, "inp_embd", -1);
 
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-struct ggml_tensor * KQ_mask =
+struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
 cb(KQ_mask, "KQ_mask", -1);
 
 for (int il = 0; il < n_layer; ++il) {
@@ -5344,11 +5702,9 @@ struct llm_build_context {
 
 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-
-
-cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -5404,20 +5760,20 @@ struct llm_build_context {
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 
-inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
 cb(inpL, "inp_embd", -1);
 
 // inp_pos - contains the positions
-struct ggml_tensor * inp_pos =
+struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
 cb(inp_pos, "inp_pos", -1);
 
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-struct ggml_tensor * KQ_mask =
+struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
 cb(KQ_mask, "KQ_mask", -1);
 
 // shift the entire K-cache if needed
 if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
 }
 
 for (int il = 0; il < n_layer; ++il) {
@@ -5435,12 +5791,24 @@ struct llm_build_context {
 // compute Q and K and RoPE them
 struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
+if (model.layers[il].bq) {
+Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+cb(Qcur, "Qcur", il);
+}
 
 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
+if (model.layers[il].bk) {
+Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+cb(Kcur, "Kcur", il);
+}
 
 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
+if (model.layers[il].bv) {
+Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+cb(Vcur, "Vcur", il);
+}
 
 Qcur = ggml_rope_custom(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -5456,11 +5824,9 @@ struct llm_build_context {
 );
 cb(Kcur, "Kcur", il);
 
-
-
-cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -5517,20 +5883,20 @@ struct llm_build_context {
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 
-inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
 cb(inpL, "inp_embd", -1);
 
 // inp_pos - contains the positions
-struct ggml_tensor * inp_pos =
+struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
 cb(inp_pos, "inp_pos", -1);
 
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-struct ggml_tensor * KQ_mask =
+struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
 cb(KQ_mask, "KQ_mask", -1);
 
 // shift the entire K-cache if needed
 if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
 }
 
 for (int il = 0; il < n_layer; ++il) {
@@ -5573,11 +5939,9 @@ struct llm_build_context {
 );
 cb(Kcur, "Kcur", il);
 
-
-
-cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -5622,6 +5986,126 @@ struct llm_build_context {
 
 return gf;
 }
+
+struct ggml_cgraph * build_qwen2() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+const int64_t n_embd_head = hparams.n_embd_head_v;
+GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+struct ggml_tensor * cur;
+struct ggml_tensor * inpL;
+
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+cb(inpL, "inp_embd", -1);
+
+// inp_pos - contains the positions
+struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+cb(inp_pos, "inp_pos", -1);
+
+// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+cb(KQ_mask, "KQ_mask", -1);
+
+// shift the entire K-cache if needed
+if (do_rope_shift) {
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
+}
+
+for (int il = 0; il < n_layer; ++il) {
+struct ggml_tensor * inpSA = inpL;
+
+// norm
+cur = llm_build_norm(ctx0, inpL, hparams,
+model.layers[il].attn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "attn_norm", il);
+
+// self-attention
+{
+// compute Q and K and RoPE them
+struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+cb(Qcur, "Qcur", il);
+Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+cb(Qcur, "Qcur", il);
+
+struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+cb(Kcur, "Kcur", il);
+Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+cb(Kcur, "Kcur", il);
+
+struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+cb(Vcur, "Vcur", il);
+Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+cb(Vcur, "Vcur", il);
+
+// these nodes are added to the graph together so that they are not reordered
+// by doing so, the number of splits in the graph is reduced
+ggml_build_forward_expand(gf, Qcur);
+ggml_build_forward_expand(gf, Kcur);
+ggml_build_forward_expand(gf, Vcur);
+
+Qcur = ggml_rope_custom(
+ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Qcur, "Qcur", il);
cb(Qcur, "Qcur", il);
|
6055
|
+
|
6056
|
+
Kcur = ggml_rope_custom(
|
6057
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6058
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
6059
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6060
|
+
);
|
6061
|
+
cb(Kcur, "Kcur", il);
|
6062
|
+
|
6063
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6064
|
+
model.layers[il].wo, model.layers[il].bo,
|
6065
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6066
|
+
cb(cur, "kqv_out", il);
|
6067
|
+
}
|
6068
|
+
|
6069
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6070
|
+
cb(ffn_inp, "ffn_inp", il);
|
6071
|
+
|
6072
|
+
// feed-forward network
|
6073
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6074
|
+
model.layers[il].ffn_norm, NULL,
|
6075
|
+
LLM_NORM_RMS, cb, il);
|
6076
|
+
cb(cur, "ffn_norm", il);
|
6077
|
+
|
6078
|
+
cur = llm_build_ffn(ctx0, cur,
|
6079
|
+
model.layers[il].ffn_up, NULL,
|
6080
|
+
model.layers[il].ffn_gate, NULL,
|
6081
|
+
model.layers[il].ffn_down, NULL,
|
6082
|
+
NULL,
|
6083
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
6084
|
+
cb(cur, "ffn_out", il);
|
6085
|
+
|
6086
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6087
|
+
cb(cur, "l_out", il);
|
6088
|
+
|
6089
|
+
// input for next layer
|
6090
|
+
inpL = cur;
|
6091
|
+
}
|
6092
|
+
|
6093
|
+
cur = inpL;
|
6094
|
+
|
6095
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6096
|
+
model.output_norm, NULL,
|
6097
|
+
LLM_NORM_RMS, cb, -1);
|
6098
|
+
cb(cur, "result_norm", -1);
|
6099
|
+
|
6100
|
+
// lm_head
|
6101
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
6102
|
+
cb(cur, "result_output", -1);
|
6103
|
+
|
6104
|
+
ggml_build_forward_expand(gf, cur);
|
6105
|
+
|
6106
|
+
return gf;
|
6107
|
+
}
|
6108
|
+
|
5625
6109
|
struct ggml_cgraph * build_phi2() {
|
5626
6110
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5627
6111
|
|
@@ -5634,20 +6118,20 @@ struct llm_build_context {
|
|
5634
6118
|
struct ggml_tensor * ffn_output;
|
5635
6119
|
struct ggml_tensor * inpL;
|
5636
6120
|
|
5637
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
6121
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
5638
6122
|
cb(inpL, "inp_embd", -1);
|
5639
6123
|
|
5640
6124
|
// inp_pos - contains the positions
|
5641
|
-
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
6125
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
5642
6126
|
cb(inp_pos, "inp_pos", -1);
|
5643
6127
|
|
5644
6128
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5645
|
-
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
6129
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5646
6130
|
cb(KQ_mask, "KQ_mask", -1);
|
5647
6131
|
|
5648
6132
|
// shift the entire K-cache if needed
|
5649
6133
|
if (do_rope_shift) {
|
5650
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
6134
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
5651
6135
|
}
|
5652
6136
|
|
5653
6137
|
for (int il = 0; il < n_layer; ++il) {
|
@@ -5703,11 +6187,9 @@ struct llm_build_context {
|
|
5703
6187
|
);
|
5704
6188
|
cb(Kcur, "Kcur", il);
|
5705
6189
|
|
5706
|
-
|
5707
|
-
|
5708
|
-
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
6190
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5709
6191
|
model.layers[il].wo, model.layers[il].bo,
|
5710
|
-
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
|
6192
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
|
5711
6193
|
cb(cur, "kqv_out", il);
|
5712
6194
|
}
|
5713
6195
|
|
@@ -5758,20 +6240,20 @@ struct llm_build_context {
|
|
5758
6240
|
struct ggml_tensor * cur;
|
5759
6241
|
struct ggml_tensor * inpL;
|
5760
6242
|
|
5761
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
6243
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
5762
6244
|
cb(inpL, "inp_embd", -1);
|
5763
6245
|
|
5764
6246
|
// inp_pos - contains the positions
|
5765
|
-
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
6247
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
5766
6248
|
cb(inp_pos, "inp_pos", -1);
|
5767
6249
|
|
5768
6250
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5769
|
-
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
6251
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5770
6252
|
cb(KQ_mask, "KQ_mask", -1);
|
5771
6253
|
|
5772
6254
|
// shift the entire K-cache if needed
|
5773
6255
|
if (do_rope_shift) {
|
5774
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
6256
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
5775
6257
|
}
|
5776
6258
|
|
5777
6259
|
for (int il = 0; il < n_layer; ++il) {
|
@@ -5808,11 +6290,9 @@ struct llm_build_context {
|
|
5808
6290
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
5809
6291
|
cb(Kcur, "Kcur", il);
|
5810
6292
|
|
5811
|
-
|
5812
|
-
|
5813
|
-
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
6293
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5814
6294
|
model.layers[il].wo, NULL,
|
5815
|
-
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6295
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5816
6296
|
cb(cur, "kqv_out", il);
|
5817
6297
|
}
|
5818
6298
|
struct ggml_tensor * sa_out = cur;
|
@@ -5867,15 +6347,15 @@ struct llm_build_context {
|
|
5867
6347
|
struct ggml_tensor * pos;
|
5868
6348
|
struct ggml_tensor * inpL;
|
5869
6349
|
|
5870
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
6350
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
5871
6351
|
cb(inpL, "inp_embd", -1);
|
5872
6352
|
|
5873
6353
|
// inp_pos - contains the positions
|
5874
|
-
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
6354
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
5875
6355
|
cb(inp_pos, "inp_pos", -1);
|
5876
6356
|
|
5877
6357
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5878
|
-
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
6358
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5879
6359
|
cb(KQ_mask, "KQ_mask", -1);
|
5880
6360
|
|
5881
6361
|
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
@@ -5903,51 +6383,396 @@ struct llm_build_context {
|
|
5903
6383
|
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
5904
6384
|
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
5905
6385
|
|
5906
|
-
cb(Qcur, "Qcur", il);
|
6386
|
+
cb(Qcur, "Qcur", il);
|
6387
|
+
cb(Kcur, "Kcur", il);
|
6388
|
+
cb(Vcur, "Vcur", il);
|
6389
|
+
|
6390
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
6391
|
+
|
6392
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6393
|
+
model.layers[il].wo, model.layers[il].bo,
|
6394
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6395
|
+
cb(cur, "kqv_out", il);
|
6396
|
+
}
|
6397
|
+
|
6398
|
+
// add the input
|
6399
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
6400
|
+
cb(ffn_inp, "ffn_inp", il);
|
6401
|
+
|
6402
|
+
// FF
|
6403
|
+
{
|
6404
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6405
|
+
model.layers[il].ffn_norm,
|
6406
|
+
model.layers[il].ffn_norm_b,
|
6407
|
+
LLM_NORM, cb, il);
|
6408
|
+
cb(cur, "ffn_norm", il);
|
6409
|
+
|
6410
|
+
cur = llm_build_ffn(ctx0, cur,
|
6411
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
6412
|
+
NULL, NULL,
|
6413
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
6414
|
+
NULL,
|
6415
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
6416
|
+
cb(cur, "ffn_out", il);
|
6417
|
+
}
|
6418
|
+
|
6419
|
+
inpL = ggml_add(ctx0, cur, ffn_inp);
|
6420
|
+
cb(inpL, "l_out", il);
|
6421
|
+
}
|
6422
|
+
|
6423
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6424
|
+
model.output_norm,
|
6425
|
+
model.output_norm_b,
|
6426
|
+
LLM_NORM, cb, -1);
|
6427
|
+
cb(cur, "result_norm", -1);
|
6428
|
+
|
6429
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
6430
|
+
cb(cur, "result_output", -1);
|
6431
|
+
|
6432
|
+
ggml_build_forward_expand(gf, cur);
|
6433
|
+
|
6434
|
+
return gf;
|
6435
|
+
}
|
6436
|
+
|
6437
|
+
struct ggml_cgraph * build_codeshell() {
|
6438
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6439
|
+
|
6440
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6441
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
6442
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6443
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6444
|
+
|
6445
|
+
struct ggml_tensor * cur;
|
6446
|
+
struct ggml_tensor * inpL;
|
6447
|
+
|
6448
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
6449
|
+
cb(inpL, "inp_embd", -1);
|
6450
|
+
|
6451
|
+
// inp_pos - contains the positions
|
6452
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
6453
|
+
cb(inp_pos, "inp_pos", -1);
|
6454
|
+
|
6455
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6456
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6457
|
+
cb(KQ_mask, "KQ_mask", -1);
|
6458
|
+
|
6459
|
+
// shift the entire K-cache if needed
|
6460
|
+
if (do_rope_shift) {
|
6461
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
6462
|
+
}
|
6463
|
+
|
6464
|
+
for (int il = 0; il < n_layer; ++il) {
|
6465
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6466
|
+
model.layers[il].attn_norm,
|
6467
|
+
model.layers[il].attn_norm_b,
|
6468
|
+
LLM_NORM, cb, il);
|
6469
|
+
cb(cur, "attn_norm", il);
|
6470
|
+
|
6471
|
+
// self-attention
|
6472
|
+
{
|
6473
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
6474
|
+
cb(cur, "wqkv", il);
|
6475
|
+
|
6476
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
6477
|
+
cb(cur, "bqkv", il);
|
6478
|
+
|
6479
|
+
struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
6480
|
+
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
6481
|
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
6482
|
+
|
6483
|
+
cb(tmpq, "tmpq", il);
|
6484
|
+
cb(tmpk, "tmpk", il);
|
6485
|
+
cb(Vcur, "Vcur", il);
|
6486
|
+
|
6487
|
+
struct ggml_tensor * Qcur = ggml_rope_custom(
|
6488
|
+
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
|
6489
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
6490
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6491
|
+
);
|
6492
|
+
cb(Qcur, "Qcur", il);
|
6493
|
+
|
6494
|
+
struct ggml_tensor * Kcur = ggml_rope_custom(
|
6495
|
+
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6496
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
6497
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6498
|
+
);
|
6499
|
+
cb(Kcur, "Kcur", il);
|
6500
|
+
|
6501
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6502
|
+
model.layers[il].wo, model.layers[il].bo,
|
6503
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6504
|
+
cb(cur, "kqv_out", il);
|
6505
|
+
}
|
6506
|
+
|
6507
|
+
// add the input
|
6508
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
6509
|
+
cb(ffn_inp, "ffn_inp", il);
|
6510
|
+
|
6511
|
+
// FF
|
6512
|
+
{
|
6513
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6514
|
+
model.layers[il].ffn_norm,
|
6515
|
+
model.layers[il].ffn_norm_b,
|
6516
|
+
LLM_NORM, cb, il);
|
6517
|
+
cb(cur, "ffn_norm", il);
|
6518
|
+
|
6519
|
+
cur = llm_build_ffn(ctx0, cur,
|
6520
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
6521
|
+
NULL, NULL,
|
6522
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
6523
|
+
NULL,
|
6524
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
6525
|
+
cb(cur, "ffn_out", il);
|
6526
|
+
}
|
6527
|
+
|
6528
|
+
inpL = ggml_add(ctx0, cur, ffn_inp);
|
6529
|
+
cb(inpL, "l_out", il);
|
6530
|
+
}
|
6531
|
+
|
6532
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6533
|
+
model.output_norm,
|
6534
|
+
model.output_norm_b,
|
6535
|
+
LLM_NORM, cb, -1);
|
6536
|
+
cb(cur, "result_norm", -1);
|
6537
|
+
|
6538
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
6539
|
+
cb(cur, "result_output", -1);
|
6540
|
+
|
6541
|
+
ggml_build_forward_expand(gf, cur);
|
6542
|
+
|
6543
|
+
return gf;
|
6544
|
+
}
|
6545
|
+
|
6546
|
+
struct ggml_cgraph * build_orion() {
|
6547
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6548
|
+
|
6549
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6550
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6551
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6552
|
+
|
6553
|
+
struct ggml_tensor * cur;
|
6554
|
+
struct ggml_tensor * inpL;
|
6555
|
+
|
6556
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
6557
|
+
cb(inpL, "inp_embd", -1);
|
6558
|
+
|
6559
|
+
// inp_pos - contains the positions
|
6560
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
6561
|
+
cb(inp_pos, "inp_pos", -1);
|
6562
|
+
|
6563
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6564
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6565
|
+
cb(KQ_mask, "KQ_mask", -1);
|
6566
|
+
|
6567
|
+
// shift the entire K-cache if needed
|
6568
|
+
if (do_rope_shift) {
|
6569
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
6570
|
+
}
|
6571
|
+
|
6572
|
+
for (int il = 0; il < n_layer; ++il) {
|
6573
|
+
struct ggml_tensor * inpSA = inpL;
|
6574
|
+
|
6575
|
+
// norm
|
6576
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6577
|
+
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
6578
|
+
LLM_NORM, cb, il);
|
6579
|
+
cb(cur, "attn_norm", il);
|
6580
|
+
|
6581
|
+
// self-attention
|
6582
|
+
{
|
6583
|
+
// compute Q and K and RoPE them
|
6584
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6585
|
+
cb(Qcur, "Qcur", il);
|
6586
|
+
// if (model.layers[il].bq) {
|
6587
|
+
// Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
6588
|
+
// cb(Qcur, "Qcur", il);
|
6589
|
+
// }
|
6590
|
+
|
6591
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6592
|
+
cb(Kcur, "Kcur", il);
|
6593
|
+
// if (model.layers[il].bk) {
|
6594
|
+
// Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
6595
|
+
// cb(Kcur, "Kcur", il);
|
6596
|
+
// }
|
6597
|
+
|
6598
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6599
|
+
cb(Vcur, "Vcur", il);
|
6600
|
+
// if (model.layers[il].bv) {
|
6601
|
+
// Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
6602
|
+
// cb(Vcur, "Vcur", il);
|
6603
|
+
// }
|
6604
|
+
|
6605
|
+
Qcur = ggml_rope_custom(
|
6606
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6607
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
6608
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6609
|
+
);
|
6610
|
+
cb(Qcur, "Qcur", il);
|
6611
|
+
|
6612
|
+
Kcur = ggml_rope_custom(
|
6613
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6614
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
6615
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6616
|
+
);
|
6617
|
+
cb(Kcur, "Kcur", il);
|
6618
|
+
|
6619
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6620
|
+
model.layers[il].wo, NULL,
|
6621
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6622
|
+
cb(cur, "kqv_out", il);
|
6623
|
+
}
|
6624
|
+
|
6625
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6626
|
+
cb(ffn_inp, "ffn_inp", il);
|
6627
|
+
|
6628
|
+
// feed-forward network
|
6629
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6630
|
+
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
6631
|
+
LLM_NORM, cb, il);
|
6632
|
+
cb(cur, "ffn_norm", il);
|
6633
|
+
|
6634
|
+
cur = llm_build_ffn(ctx0, cur,
|
6635
|
+
model.layers[il].ffn_up, NULL,
|
6636
|
+
model.layers[il].ffn_gate, NULL,
|
6637
|
+
model.layers[il].ffn_down, NULL,
|
6638
|
+
NULL,
|
6639
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
6640
|
+
cb(cur, "ffn_out", il);
|
6641
|
+
|
6642
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6643
|
+
cb(cur, "l_out", il);
|
6644
|
+
|
6645
|
+
// input for next layer
|
6646
|
+
inpL = cur;
|
6647
|
+
}
|
6648
|
+
|
6649
|
+
cur = inpL;
|
6650
|
+
|
6651
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6652
|
+
model.output_norm, model.output_norm_b,
|
6653
|
+
LLM_NORM, cb, -1);
|
6654
|
+
cb(cur, "result_norm", -1);
|
6655
|
+
|
6656
|
+
// lm_head
|
6657
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
6658
|
+
cb(cur, "result_output", -1);
|
6659
|
+
|
6660
|
+
ggml_build_forward_expand(gf, cur);
|
6661
|
+
|
6662
|
+
return gf;
|
6663
|
+
}
|
6664
|
+
|
6665
|
+
struct ggml_cgraph * build_internlm2() {
|
6666
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6667
|
+
|
6668
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6669
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6670
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6671
|
+
|
6672
|
+
struct ggml_tensor * cur;
|
6673
|
+
struct ggml_tensor * inpL;
|
6674
|
+
|
6675
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
6676
|
+
cb(inpL, "inp_embd", -1);
|
6677
|
+
|
6678
|
+
// inp_pos - contains the positions
|
6679
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
6680
|
+
cb(inp_pos, "inp_pos", -1);
|
6681
|
+
|
6682
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6683
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6684
|
+
cb(KQ_mask, "KQ_mask", -1);
|
6685
|
+
|
6686
|
+
// shift the entire K-cache if needed
|
6687
|
+
if (do_rope_shift) {
|
6688
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
6689
|
+
}
|
6690
|
+
|
6691
|
+
for (int il = 0; il < n_layer; ++il) {
|
6692
|
+
struct ggml_tensor * inpSA = inpL;
|
6693
|
+
|
6694
|
+
// norm
|
6695
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6696
|
+
model.layers[il].attn_norm, NULL,
|
6697
|
+
LLM_NORM_RMS, cb, il);
|
6698
|
+
cb(cur, "attn_norm", il);
|
6699
|
+
|
6700
|
+
// self-attention
|
6701
|
+
{
|
6702
|
+
// compute Q and K and RoPE them
|
6703
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6704
|
+
cb(Qcur, "Qcur", il);
|
6705
|
+
if (model.layers[il].bq) {
|
6706
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
6707
|
+
cb(Qcur, "Qcur", il);
|
6708
|
+
}
|
6709
|
+
|
6710
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
5907
6711
|
cb(Kcur, "Kcur", il);
|
6712
|
+
if (model.layers[il].bk) {
|
6713
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
6714
|
+
cb(Kcur, "Kcur", il);
|
6715
|
+
}
|
6716
|
+
|
6717
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
5908
6718
|
cb(Vcur, "Vcur", il);
|
6719
|
+
if (model.layers[il].bv) {
|
6720
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
6721
|
+
cb(Vcur, "Vcur", il);
|
6722
|
+
}
|
5909
6723
|
|
5910
|
-
Qcur =
|
6724
|
+
Qcur = ggml_rope_custom(
|
6725
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6726
|
+
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
6727
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6728
|
+
);
|
6729
|
+
cb(Qcur, "Qcur", il);
|
5911
6730
|
|
5912
|
-
|
6731
|
+
Kcur = ggml_rope_custom(
|
6732
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6733
|
+
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
6734
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6735
|
+
);
|
6736
|
+
cb(Kcur, "Kcur", il);
|
5913
6737
|
|
5914
|
-
cur =
|
6738
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5915
6739
|
model.layers[il].wo, model.layers[il].bo,
|
5916
|
-
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6740
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5917
6741
|
cb(cur, "kqv_out", il);
|
5918
6742
|
}
|
5919
6743
|
|
5920
|
-
|
5921
|
-
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
6744
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
5922
6745
|
cb(ffn_inp, "ffn_inp", il);
|
5923
6746
|
|
5924
|
-
//
|
5925
|
-
|
5926
|
-
|
5927
|
-
|
5928
|
-
|
5929
|
-
LLM_NORM, cb, il);
|
5930
|
-
cb(cur, "ffn_norm", il);
|
6747
|
+
// feed-forward network
|
6748
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6749
|
+
model.layers[il].ffn_norm, NULL,
|
6750
|
+
LLM_NORM_RMS, cb, il);
|
6751
|
+
cb(cur, "ffn_norm", il);
|
5931
6752
|
|
5932
|
-
|
5933
|
-
|
5934
|
-
|
5935
|
-
|
5936
|
-
|
5937
|
-
|
5938
|
-
|
5939
|
-
}
|
6753
|
+
cur = llm_build_ffn(ctx0, cur,
|
6754
|
+
model.layers[il].ffn_up, NULL,
|
6755
|
+
model.layers[il].ffn_gate, NULL,
|
6756
|
+
model.layers[il].ffn_down, NULL,
|
6757
|
+
NULL,
|
6758
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
6759
|
+
cb(cur, "ffn_out", il);
|
5940
6760
|
|
5941
|
-
|
5942
|
-
cb(
|
6761
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6762
|
+
cb(cur, "l_out", il);
|
6763
|
+
|
6764
|
+
// input for next layer
|
6765
|
+
inpL = cur;
|
5943
6766
|
}
|
5944
6767
|
|
5945
|
-
cur =
|
5946
|
-
|
5947
|
-
|
5948
|
-
|
6768
|
+
cur = inpL;
|
6769
|
+
|
6770
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6771
|
+
model.output_norm, NULL,
|
6772
|
+
LLM_NORM_RMS, cb, -1);
|
5949
6773
|
cb(cur, "result_norm", -1);
|
5950
6774
|
|
6775
|
+
// lm_head
|
5951
6776
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5952
6777
|
cb(cur, "result_output", -1);
|
5953
6778
|
|
@@ -5955,6 +6780,7 @@ struct llm_build_context {
|
|
5955
6780
|
|
5956
6781
|
return gf;
|
5957
6782
|
}
|
6783
|
+
|
5958
6784
|
};
|
5959
6785
|
|
5960
6786
|
static struct ggml_cgraph * llama_build_graph(
|
@@ -5965,15 +6791,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5965
6791
|
// check if we should build the worst-case graph (for memory measurement)
|
5966
6792
|
const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
|
5967
6793
|
|
5968
|
-
// keep track of the input that has already been allocated
|
5969
|
-
bool alloc_inp_tokens = false;
|
5970
|
-
bool alloc_inp_embd = false;
|
5971
|
-
bool alloc_inp_pos = false;
|
5972
|
-
bool alloc_inp_KQ_mask = false;
|
5973
|
-
bool alloc_inp_K_shift = false;
|
5974
|
-
|
5975
6794
|
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
5976
|
-
// TODO: improve handling of input and output tensors, then replace this with ggml_set_name
|
5977
6795
|
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
5978
6796
|
if (il >= 0) {
|
5979
6797
|
ggml_format_name(cur, "%s-%d", name, il);
|
@@ -5981,118 +6799,78 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5981
6799
|
ggml_set_name(cur, name);
|
5982
6800
|
}
|
5983
6801
|
|
5984
|
-
|
5985
|
-
|
5986
|
-
|
5987
|
-
|
5988
|
-
if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
|
5989
|
-
ggml_tallocr_alloc(lctx.alloc, cur);
|
5990
|
-
|
5991
|
-
if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
|
5992
|
-
const int64_t n_tokens = cur->ne[0];
|
5993
|
-
|
5994
|
-
ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
|
6802
|
+
if (!lctx.cparams.offload_kqv) {
|
6803
|
+
if (strcmp(name, "kqv_merged_cont") == 0) {
|
6804
|
+
// all nodes between the KV store and the attention output are run on the CPU
|
6805
|
+
ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
|
5995
6806
|
}
|
5996
|
-
|
5997
|
-
alloc_inp_tokens = true;
|
5998
6807
|
}
|
6808
|
+
};
|
5999
6809
|
|
6000
|
-
|
6001
|
-
ggml_tallocr_alloc(lctx.alloc, cur);
|
6810
|
+
struct ggml_cgraph * result = NULL;
|
6002
6811
|
|
6003
|
-
|
6004
|
-
const int64_t n_embd = cur->ne[0];
|
6005
|
-
const int64_t n_tokens = cur->ne[1];
|
6812
|
+
struct llm_build_context llm(lctx, batch, cb, worst_case);
|
6006
6813
|
|
6007
|
-
|
6008
|
-
|
6814
|
+
//
|
6815
|
+
// set input data
|
6816
|
+
//
|
6817
|
+
|
6818
|
+
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
6819
|
+
if (batch.token) {
|
6820
|
+
const int64_t n_tokens = batch.n_tokens;
|
6009
6821
|
|
6010
|
-
|
6822
|
+
ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
|
6011
6823
|
}
|
6012
6824
|
|
6013
|
-
if (
|
6014
|
-
|
6825
|
+
if (batch.embd) {
|
6826
|
+
const int64_t n_embd = llm.n_embd;
|
6827
|
+
const int64_t n_tokens = batch.n_tokens;
|
6015
6828
|
|
6016
|
-
|
6017
|
-
|
6829
|
+
ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
6830
|
+
}
|
6018
6831
|
|
6019
|
-
|
6020
|
-
|
6021
|
-
}
|
6832
|
+
if (batch.pos) {
|
6833
|
+
const int64_t n_tokens = batch.n_tokens;
|
6022
6834
|
|
6023
|
-
|
6835
|
+
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
6024
6836
|
}
|
6025
6837
|
|
6026
|
-
|
6027
|
-
|
6838
|
+
{
|
6839
|
+
const int64_t n_kv = llm.n_kv;
|
6840
|
+
const int64_t n_tokens = batch.n_tokens;
|
6028
6841
|
|
6029
|
-
|
6030
|
-
|
6031
|
-
const int64_t n_tokens = cur->ne[1];
|
6842
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
6843
|
+
float * data = (float *) lctx.inp_KQ_mask->data;
|
6032
6844
|
|
6033
|
-
|
6034
|
-
|
6035
|
-
|
6036
|
-
|
6037
|
-
lctx.buf_copy.resize(ggml_nbytes(cur));
|
6038
|
-
data = (float *) lctx.buf_copy.data();
|
6039
|
-
}
|
6845
|
+
for (int h = 0; h < 1; ++h) {
|
6846
|
+
for (int j = 0; j < n_tokens; ++j) {
|
6847
|
+
const llama_pos pos = batch.pos[j];
|
6848
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
6040
6849
|
|
6041
|
-
|
6042
|
-
|
6043
|
-
|
6044
|
-
|
6045
|
-
|
6046
|
-
|
6047
|
-
float f;
|
6048
|
-
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
6049
|
-
f = -INFINITY;
|
6050
|
-
} else {
|
6051
|
-
f = 0;
|
6052
|
-
}
|
6053
|
-
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
6850
|
+
for (int i = 0; i < n_kv; ++i) {
|
6851
|
+
float f;
|
6852
|
+
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
6853
|
+
f = -INFINITY;
|
6854
|
+
} else {
|
6855
|
+
f = 0;
|
6054
6856
|
}
|
6857
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
6055
6858
|
}
|
6056
6859
|
}
|
6057
|
-
|
6058
|
-
if (data != cur->data) {
|
6059
|
-
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
6060
|
-
}
|
6061
6860
|
}
|
6062
|
-
|
6063
|
-
alloc_inp_KQ_mask = true;
|
6064
6861
|
}
|
6065
6862
|
|
6066
|
-
if (
|
6067
|
-
|
6068
|
-
|
6069
|
-
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
6070
|
-
const int64_t n_ctx = cur->ne[0];
|
6071
|
-
|
6072
|
-
int32_t * data;
|
6073
|
-
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
6074
|
-
data = (int32_t *) cur->data;
|
6075
|
-
} else {
|
6076
|
-
lctx.buf_copy.resize(ggml_nbytes(cur));
|
6077
|
-
data = (int32_t *) lctx.buf_copy.data();
|
6078
|
-
}
|
6863
|
+
if (llm.do_rope_shift) {
|
6864
|
+
const int64_t n_ctx = llm.n_ctx;
|
6079
6865
|
|
6080
|
-
|
6081
|
-
|
6082
|
-
}
|
6866
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
6867
|
+
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
6083
6868
|
|
6084
|
-
|
6085
|
-
|
6086
|
-
}
|
6869
|
+
for (int i = 0; i < n_ctx; ++i) {
|
6870
|
+
data[i] = lctx.kv_self.cells[i].delta;
|
6087
6871
|
}
|
6088
|
-
|
6089
|
-
alloc_inp_K_shift = true;
|
6090
6872
|
}
|
6091
|
-
}
|
6092
|
-
|
6093
|
-
struct ggml_cgraph * result = NULL;
|
6094
|
-
|
6095
|
-
struct llm_build_context llm(lctx, batch, cb, worst_case);
|
6873
|
+
}
|
6096
6874
|
|
6097
6875
|
llm.init();
|
6098
6876
|
|
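The hunk above moves input preparation out of the per-tensor callback: token, embedding and position data are copied with `ggml_backend_tensor_set`, while the KQ mask and the K-shift deltas are written directly into host buffers. A self-contained sketch of just the masking rule used for `inp_KQ_mask` (the `Cell` struct and `fill_kq_mask` name are illustrative; the real cache cell tracks a set of sequence ids):

```cpp
#include <cmath>
#include <vector>

// One KV cell as needed by the masking rule in the diff: a cached token at
// position `pos` belonging to sequence `seq_id`.
struct Cell {
    int pos;
    int seq_id;
};

// data is laid out as [n_tokens][n_kv]; a cached cell is visible to query token j
// only if it belongs to the same sequence and is not in the future (cell.pos <= pos_j),
// otherwise the mask entry is -INFINITY, exactly as in the diff.
void fill_kq_mask(std::vector<float> & data, const std::vector<Cell> & cells,
                  const std::vector<int> & pos, const std::vector<int> & seq_id,
                  int n_kv, int n_tokens) {
    data.resize((size_t) n_kv * n_tokens);
    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            const bool visible = cells[i].seq_id == seq_id[j] && cells[i].pos <= pos[j];
            data[(size_t) j*n_kv + i] = visible ? 0.0f : -INFINITY;
        }
    }
}
```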
@@ -6137,6 +6915,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
6137
6915
|
{
|
6138
6916
|
result = llm.build_qwen();
|
6139
6917
|
} break;
|
6918
|
+
case LLM_ARCH_QWEN2:
|
6919
|
+
{
|
6920
|
+
result = llm.build_qwen2();
|
6921
|
+
} break;
|
6140
6922
|
case LLM_ARCH_PHI2:
|
6141
6923
|
{
|
6142
6924
|
result = llm.build_phi2();
|
@@ -6149,6 +6931,18 @@ static struct ggml_cgraph * llama_build_graph(
|
|
6149
6931
|
{
|
6150
6932
|
result = llm.build_gpt2();
|
6151
6933
|
} break;
|
6934
|
+
case LLM_ARCH_CODESHELL:
|
6935
|
+
{
|
6936
|
+
result = llm.build_codeshell();
|
6937
|
+
} break;
|
6938
|
+
case LLM_ARCH_ORION:
|
6939
|
+
{
|
6940
|
+
result = llm.build_orion();
|
6941
|
+
} break;
|
6942
|
+
case LLM_ARCH_INTERNLM2:
|
6943
|
+
{
|
6944
|
+
result = llm.build_internlm2();
|
6945
|
+
} break;
|
6152
6946
|
default:
|
6153
6947
|
GGML_ASSERT(false);
|
6154
6948
|
}
|
@@ -6254,6 +7048,7 @@ static int llama_decode_internal(
|
|
6254
7048
|
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
6255
7049
|
|
6256
7050
|
ggml_backend_sched_reset(lctx.sched);
|
7051
|
+
ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
|
6257
7052
|
|
6258
7053
|
ggml_cgraph * gf = llama_build_graph(lctx, batch);
|
6259
7054
|
|
@@ -6279,11 +7074,6 @@ static int llama_decode_internal(
|
|
6279
7074
|
n_threads = std::min(4, n_threads);
|
6280
7075
|
}
|
6281
7076
|
|
6282
|
-
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
|
6283
|
-
if (ggml_cpu_has_cublas() && fully_offloaded) {
|
6284
|
-
n_threads = 1;
|
6285
|
-
}
|
6286
|
-
|
6287
7077
|
#ifdef GGML_USE_MPI
|
6288
7078
|
const int64_t n_layer = hparams.n_layer;
|
6289
7079
|
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
@@ -7095,7 +7885,9 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
7095
7885
|
//
|
7096
7886
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
7097
7887
|
if (&fragment == &fragment_buffer.front()) {
|
7098
|
-
|
7888
|
+
if (vocab.add_space_prefix) {
|
7889
|
+
raw_text = " " + raw_text; // prefix with space if the first token is not special
|
7890
|
+
}
|
7099
7891
|
}
|
7100
7892
|
|
7101
7893
|
#ifdef PRETOKENIZERDEBUG
|
@@ -7574,6 +8366,11 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
|
|
7574
8366
|
}
|
7575
8367
|
|
7576
8368
|
void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
|
8369
|
+
// TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
|
8370
|
+
// if (k >= (int32_t)candidates->size) {
|
8371
|
+
// return;
|
8372
|
+
// }
|
8373
|
+
|
7577
8374
|
const int64_t t_start_sample_us = ggml_time_us();
|
7578
8375
|
|
7579
8376
|
k = std::max(k, (int) min_keep);
|
@@ -7584,10 +8381,57 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
|
|
7584
8381
|
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
|
7585
8382
|
return a.logit > b.logit;
|
7586
8383
|
};
|
7587
|
-
if (k == (int) candidates->size) {
|
7588
|
-
std::sort(candidates->data, candidates->data + candidates->size, comp);
|
7589
|
-
} else {
|
8384
|
+
if (k <= 128) {
|
7590
8385
|
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
|
8386
|
+
} else {
|
8387
|
+
constexpr int nbuckets = 128;
|
8388
|
+
constexpr float bucket_low = -10.0f;
|
8389
|
+
constexpr float bucket_high = 10.0f;
|
8390
|
+
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
|
8391
|
+
constexpr float bucker_inter = -bucket_low * bucket_scale;
|
8392
|
+
|
8393
|
+
std::vector<int> bucket_idx(candidates->size);
|
8394
|
+
std::vector<int> histo(nbuckets, 0);
|
8395
|
+
|
8396
|
+
for (int i = 0; i < (int)candidates->size; ++i) {
|
8397
|
+
const float val = candidates->data[i].logit;
|
8398
|
+
int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
|
8399
|
+
ib = std::max(0, std::min(nbuckets-1, ib));
|
8400
|
+
bucket_idx[i] = ib;
|
8401
|
+
++histo[ib];
|
8402
|
+
}
|
8403
|
+
int nhave = 0;
|
8404
|
+
int ib = nbuckets - 1;
|
8405
|
+
for ( ; ib >= 0; --ib) {
|
8406
|
+
nhave += histo[ib];
|
8407
|
+
if (nhave >= k) break;
|
8408
|
+
}
|
8409
|
+
std::vector<llama_token_data> tmp_tokens(nhave);
|
8410
|
+
auto ptr = tmp_tokens.data();
|
8411
|
+
std::vector<llama_token_data*> bucket_ptrs;
|
8412
|
+
bucket_ptrs.reserve(nbuckets - ib);
|
8413
|
+
for (int j = nbuckets - 1; j >= ib; --j) {
|
8414
|
+
bucket_ptrs.push_back(ptr);
|
8415
|
+
ptr += histo[j];
|
8416
|
+
}
|
8417
|
+
for (int i = 0; i < (int)candidates->size; ++i) {
|
8418
|
+
int j = bucket_idx[i];
|
8419
|
+
if (j >= ib) {
|
8420
|
+
*bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
|
8421
|
+
}
|
8422
|
+
}
|
8423
|
+
|
8424
|
+
ptr = tmp_tokens.data();
|
8425
|
+
int ndone = 0;
|
8426
|
+
for (int j = nbuckets-1; j > ib; --j) {
|
8427
|
+
std::sort(ptr, ptr + histo[j], comp);
|
8428
|
+
ptr += histo[j];
|
8429
|
+
ndone += histo[j];
|
8430
|
+
}
|
8431
|
+
std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
|
8432
|
+
|
8433
|
+
std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
|
8434
|
+
|
7591
8435
|
}
|
7592
8436
|
candidates->sorted = true;
|
7593
8437
|
}
|
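For large `k`, the new `llama_sample_top_k` path above replaces a full `std::partial_sort` with a 128-bucket histogram over the logits, gathers only the buckets that can contain the top `k` entries, and sorts just those. The sketch below shows the same idea over plain floats, simplified to sort every gathered candidate instead of partial-sorting the boundary bucket; it assumes `k <= logits.size()` and reuses the [-10, 10] bucket range from the diff.

```cpp
#include <algorithm>
#include <functional>
#include <vector>

// Return the k largest logits in descending order using a 128-bucket histogram,
// mirroring the bucket-sort fast path added to llama_sample_top_k.
std::vector<float> top_k_bucketed(const std::vector<float> & logits, int k) {
    constexpr int   nbuckets     = 128;
    constexpr float bucket_low   = -10.0f;
    constexpr float bucket_high  =  10.0f;
    constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);

    // histogram pass: which bucket does each logit fall into?
    std::vector<int> histo(nbuckets, 0);
    std::vector<int> bucket_idx(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) {
        int ib = (int)(bucket_scale * (logits[i] - bucket_low));
        ib = std::max(0, std::min(nbuckets - 1, ib));
        bucket_idx[i] = ib;
        ++histo[ib];
    }

    // find how many of the highest buckets are needed to cover k elements
    int nhave = 0, ib = nbuckets - 1;
    for (; ib >= 0; --ib) {
        nhave += histo[ib];
        if (nhave >= k) break;
    }

    // gather the candidates from those buckets and sort only them
    std::vector<float> tmp;
    tmp.reserve(nhave);
    for (size_t i = 0; i < logits.size(); ++i) {
        if (bucket_idx[i] >= ib) tmp.push_back(logits[i]);
    }
    std::sort(tmp.begin(), tmp.end(), std::greater<float>());
    tmp.resize(k);
    return tmp;
}
```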
@@ -7635,21 +8479,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
|
|
7635
8479
|
return;
|
7636
8480
|
}
|
7637
8481
|
|
7638
|
-
llama_sample_softmax(ctx, candidates);
|
7639
|
-
|
7640
8482
|
const int64_t t_start_sample_us = ggml_time_us();
|
7641
8483
|
|
7642
|
-
|
7643
|
-
|
8484
|
+
bool min_p_applied = false;
|
8485
|
+
|
8486
|
+
// if the candidates aren't sorted, try the unsorted implementation first
|
8487
|
+
if (!candidates->sorted) {
|
8488
|
+
std::vector<llama_token_data> filtered_tokens;
|
8489
|
+
|
8490
|
+
float max_logit = -FLT_MAX;
|
8491
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
8492
|
+
max_logit = std::max(max_logit, candidates->data[i].logit);
|
8493
|
+
}
|
8494
|
+
const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
|
8495
|
+
|
8496
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
8497
|
+
if (candidates->data[i].logit >= min_logit) {
|
8498
|
+
filtered_tokens.push_back(candidates->data[i]);
|
8499
|
+
}
|
8500
|
+
}
|
7644
8501
|
|
7645
|
-
|
7646
|
-
if (
|
7647
|
-
|
8502
|
+
// if we have enough values the operation was a success
|
8503
|
+
if (filtered_tokens.size() >= min_keep) {
|
8504
|
+
memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
|
8505
|
+
candidates->size = filtered_tokens.size();
|
8506
|
+
min_p_applied = true;
|
7648
8507
|
}
|
7649
8508
|
}
|
7650
8509
|
|
7651
|
-
//
|
7652
|
-
|
8510
|
+
// if the candidates are sorted or the unsorted implementation failed, use this implementation
|
8511
|
+
if (!min_p_applied) {
|
8512
|
+
// Sort the logits in descending order
|
8513
|
+
if (!candidates->sorted) {
|
8514
|
+
std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
8515
|
+
return a.logit > b.logit;
|
8516
|
+
});
|
8517
|
+
candidates->sorted = true;
|
8518
|
+
}
|
8519
|
+
|
8520
|
+
const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
|
8521
|
+
size_t i = 1; // first token always matches
|
8522
|
+
|
8523
|
+
for (; i < candidates->size; ++i) {
|
8524
|
+
if (candidates->data[i].logit < min_logit && i >= min_keep) {
|
8525
|
+
break; // prob too small
|
8526
|
+
}
|
8527
|
+
}
|
8528
|
+
|
8529
|
+
// Resize the output vector to keep only the matching tokens
|
8530
|
+
candidates->size = i;
|
8531
|
+
}
|
7653
8532
|
|
7654
8533
|
if (ctx) {
|
7655
8534
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
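The rewritten `llama_sample_min_p` above can skip both softmax and sorting because the min-p condition p_i >= p * p_max is equivalent to logit_i >= max_logit + log(p). A short standalone sketch of that unsorted filter (assumes a non-empty logit list and 0 < p <= 1; the `min_keep` fallback is omitted):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Keep only logits whose probability would be at least p times the top probability,
// without computing softmax: p_i/p_max = exp(logit_i - max_logit) >= p
// is the same as logit_i >= max_logit + log(p).
std::vector<float> min_p_filter(const std::vector<float> & logits, float p) {
    const float max_logit = *std::max_element(logits.begin(), logits.end());
    const float min_logit = max_logit + std::log(p);

    std::vector<float> kept;
    for (float l : logits) {
        if (l >= min_logit) kept.push_back(l);
    }
    return kept;
}
```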
@@ -7779,6 +8658,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
|
|
7779
8658
|
}
|
7780
8659
|
}
|
7781
8660
|
|
8661
|
+
void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
|
8662
|
+
const int64_t t_start_sample_us = ggml_time_us();
|
8663
|
+
|
8664
|
+
// no need to do anything if there is only one (or zero) candidates
|
8665
|
+
if(candidates_p->size <= 1) {
|
8666
|
+
return;
|
8667
|
+
}
|
8668
|
+
|
8669
|
+
// Calculate maximum possible entropy
|
8670
|
+
float max_entropy = -logf(1.0f / candidates_p->size);
|
8671
|
+
|
8672
|
+
llama_sample_softmax(nullptr, candidates_p);
|
8673
|
+
|
8674
|
+
// Calculate entropy of the softmax probabilities
|
8675
|
+
float entropy = 0.0f;
|
8676
|
+
for (size_t i = 0; i < candidates_p->size; ++i) {
|
8677
|
+
float prob = candidates_p->data[i].p;
|
8678
|
+
if (prob > 0.0f) { // Ensure no log(0)
|
8679
|
+
entropy -= prob * logf(prob);
|
8680
|
+
}
|
8681
|
+
}
|
8682
|
+
|
8683
|
+
// Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above)
|
8684
|
+
float normalized_entropy = entropy / max_entropy;
|
8685
|
+
|
8686
|
+
// Map the normalized entropy to the desired temperature range using the power function
|
8687
|
+
float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
|
8688
|
+
|
8689
|
+
#ifdef DEBUG
|
8690
|
+
LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
|
8691
|
+
LLAMA_LOG_INFO("Entropy: %f\n", entropy);
|
8692
|
+
LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
|
8693
|
+
LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
|
8694
|
+
LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
|
8695
|
+
LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
|
8696
|
+
#endif
|
8697
|
+
|
8698
|
+
// Apply the dynamically calculated temperature scaling
|
8699
|
+
for (size_t i = 0; i < candidates_p->size; ++i) {
|
8700
|
+
candidates_p->data[i].logit /= dyn_temp;
|
8701
|
+
}
|
8702
|
+
|
8703
|
+
// Re-compute softmax probabilities after scaling logits with dynamic temperature
|
8704
|
+
double max_l_double = candidates_p->data[0].logit;
|
8705
|
+
double cum_sum_double = 0.0;
|
8706
|
+
for (size_t i = 0; i < candidates_p->size; ++i) {
|
8707
|
+
double p = exp(candidates_p->data[i].logit - max_l_double);
|
8708
|
+
candidates_p->data[i].p = p; // Store the scaled probability
|
8709
|
+
cum_sum_double += p;
|
8710
|
+
}
|
8711
|
+
for (size_t i = 0; i < candidates_p->size; ++i) {
|
8712
|
+
candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
|
8713
|
+
}
|
8714
|
+
|
8715
|
+
#ifdef DEBUG
|
8716
|
+
// Print the updated top 25 probabilities after temperature scaling
|
8717
|
+
LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
|
8718
|
+
for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
|
8719
|
+
LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
|
8720
|
+
}
|
8721
|
+
#endif
|
8722
|
+
|
8723
|
+
if (ctx) {
|
8724
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
8725
|
+
}
|
8726
|
+
}
|
8727
|
+
|
7782
8728
|
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
7783
8729
|
const int64_t t_start_sample_us = ggml_time_us();
|
7784
8730
|
|
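`llama_sample_entropy` above rescales the logits by a temperature derived from how peaked the distribution is: the entropy of softmax(logits) is normalized by its maximum value log(n) and mapped through a power curve onto [min_temp, max_temp]. The helper below reproduces only that temperature computation; the name `dynamic_temperature` is illustrative.

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Given raw logits, compute the dynamic temperature the entropy sampler would use:
// dyn_temp = min_temp + (max_temp - min_temp) * (H / H_max)^exponent,
// where H is the entropy of softmax(logits) and H_max = log(n).
float dynamic_temperature(const std::vector<float> & logits,
                          float min_temp, float max_temp, float exponent) {
    const size_t n = logits.size();
    if (n <= 1) return min_temp;   // nothing to rescale, matches the early return in the diff

    float max_l = logits[0];
    for (float l : logits) max_l = std::max(max_l, l);

    // softmax
    std::vector<float> p(n);
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i) { p[i] = std::exp(logits[i] - max_l); sum += p[i]; }
    for (size_t i = 0; i < n; ++i) p[i] /= sum;

    // entropy and its maximum possible value for n outcomes
    float entropy = 0.0f;
    for (float pi : p) if (pi > 0.0f) entropy -= pi * std::log(pi);
    const float max_entropy = std::log((float) n);

    const float normalized = entropy / max_entropy;
    return min_temp + (max_temp - min_temp) * std::pow(normalized, exponent);
}
```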
@@ -8367,9 +9313,13 @@ struct quantize_state_internal {
|
|
8367
9313
|
const llama_model_quantize_params * params;
|
8368
9314
|
|
8369
9315
|
int n_attention_wv = 0;
|
8370
|
-
int n_feed_forward_w2 = 0;
|
9316
|
+
int n_ffn_down = 0;
|
9317
|
+
int n_ffn_gate = 0;
|
9318
|
+
int n_ffn_up = 0;
|
8371
9319
|
int i_attention_wv = 0;
|
8372
|
-
int i_feed_forward_w2 = 0;
|
9320
|
+
int i_ffn_down = 0;
|
9321
|
+
int i_ffn_gate = 0;
|
9322
|
+
int i_ffn_up = 0;
|
8373
9323
|
|
8374
9324
|
int n_k_quantized = 0;
|
8375
9325
|
int n_fallback = 0;
|
@@ -8453,6 +9403,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8453
9403
|
auto use_more_bits = [](int i_layer, int num_layers) -> bool {
|
8454
9404
|
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
|
8455
9405
|
};
|
9406
|
+
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
9407
|
+
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
|
9408
|
+
if (n_expert > 1) {
|
9409
|
+
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
|
9410
|
+
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
9411
|
+
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
9412
|
+
// tensor name.
|
9413
|
+
n_layer /= n_expert;
|
9414
|
+
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
|
9415
|
+
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
|
9416
|
+
}
|
9417
|
+
if (i_layer < 0 || i_layer >= n_layer) {
|
9418
|
+
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
|
9419
|
+
}
|
9420
|
+
}
|
9421
|
+
return std::make_pair(i_layer, n_layer);
|
9422
|
+
};
|
8456
9423
|
|
8457
9424
|
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
8458
9425
|
int nx = tensor->ne[0];
|
@@ -8465,6 +9432,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8465
9432
|
else if (new_type != GGML_TYPE_Q8_0) {
|
8466
9433
|
new_type = GGML_TYPE_Q6_K;
|
8467
9434
|
}
|
9435
|
+
} else if (name == "token_embd.weight") {
|
9436
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
9437
|
+
new_type = GGML_TYPE_Q2_K;
|
9438
|
+
}
|
9439
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
9440
|
+
new_type = GGML_TYPE_Q4_K;
|
9441
|
+
}
|
8468
9442
|
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
8469
9443
|
if (name.find("attn_v.weight") != std::string::npos) {
|
8470
9444
|
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
@@ -8472,12 +9446,19 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8472
9446
|
++qs.i_attention_wv;
|
8473
9447
|
}
|
8474
9448
|
else if (name.find("ffn_down") != std::string::npos) {
|
8475
|
-
if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
|
8476
|
-
++qs.i_feed_forward_w2;
|
9449
|
+
if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
|
9450
|
+
++qs.i_ffn_down;
|
8477
9451
|
}
|
8478
|
-
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
|
8479
9452
|
} else if (name.find("attn_v.weight") != std::string::npos) {
|
8480
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
9453
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
|
9454
|
+
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
9455
|
+
}
|
9456
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
|
9457
|
+
new_type = GGML_TYPE_Q4_K;
|
9458
|
+
}
|
9459
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 4) {
|
9460
|
+
new_type = GGML_TYPE_Q4_K;
|
9461
|
+
}
|
8481
9462
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
8482
9463
|
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
8483
9464
|
}
|
@@ -8505,29 +9486,19 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8505
9486
|
// TODO: explore better strategies
|
8506
9487
|
new_type = GGML_TYPE_Q8_0;
|
8507
9488
|
}
|
8508
|
-
|
8509
|
-
|
8510
|
-
int i_layer, n_layer;
|
8511
|
-
if (n_expert == 1) {
|
8512
|
-
i_layer = qs.i_feed_forward_w2;
|
8513
|
-
n_layer = qs.n_feed_forward_w2;
|
8514
|
-
} else {
|
8515
|
-
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
|
8516
|
-
// sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
|
8517
|
-
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
8518
|
-
// tensor name.
|
8519
|
-
n_layer = qs.n_feed_forward_w2 / n_expert;
|
8520
|
-
if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
|
8521
|
-
throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
|
8522
|
-
}
|
8523
|
-
if (i_layer < 0 || i_layer >= n_layer) {
|
8524
|
-
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
|
8525
|
-
}
|
9489
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
|
9490
|
+
new_type = GGML_TYPE_Q2_K;
|
8526
9491
|
}
|
9492
|
+
} else if (name.find("ffn_down") != std::string::npos) {
|
9493
|
+
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
|
9494
|
+
int i_layer = info.first, n_layer = info.second;
|
8527
9495
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
8528
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
|
9496
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
|
8529
9497
|
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
8530
9498
|
}
|
9499
|
+
//else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
9500
|
+
// if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
|
9501
|
+
//}
|
8531
9502
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
8532
9503
|
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
|
8533
9504
|
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
|
@@ -8555,16 +9526,18 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8555
9526
|
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
|
8556
9527
|
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
|
8557
9528
|
}
|
8558
|
-
++qs.
|
9529
|
+
++qs.i_ffn_down;
|
8559
9530
|
} else if (name.find("attn_output.weight") != std::string::npos) {
|
8560
9531
|
if (arch != LLM_ARCH_FALCON) {
|
8561
9532
|
if (qs.model.hparams.n_expert == 8) {
|
8562
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype ==
|
9533
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
9534
|
+
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
|
8563
9535
|
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
8564
9536
|
new_type = GGML_TYPE_Q5_K;
|
8565
9537
|
}
|
8566
9538
|
} else {
|
8567
9539
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
9540
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
|
8568
9541
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
8569
9542
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
8570
9543
|
}
|
@@ -8577,6 +9550,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8577
9550
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
|
8578
9551
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
8579
9552
|
}
|
9553
|
+
else if (name.find("ffn_gate") != std::string::npos) {
|
9554
|
+
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
|
9555
|
+
int i_layer = info.first, n_layer = info.second;
|
9556
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
|
9557
|
+
new_type = GGML_TYPE_Q2_K;
|
9558
|
+
}
|
9559
|
+
++qs.i_ffn_gate;
|
9560
|
+
}
|
9561
|
+
else if (name.find("ffn_up") != std::string::npos) {
|
9562
|
+
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
|
9563
|
+
int i_layer = info.first, n_layer = info.second;
|
9564
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
|
9565
|
+
new_type = GGML_TYPE_Q2_K;
|
9566
|
+
}
|
9567
|
+
++qs.i_ffn_up;
|
9568
|
+
}
|
9569
|
+
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
9570
|
+
//}
|
8580
9571
|
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
|
8581
9572
|
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
|
8582
9573
|
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
@@ -8589,7 +9580,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8589
9580
|
bool convert_incompatible_tensor = false;
|
8590
9581
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
8591
9582
|
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
8592
|
-
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS
|
9583
|
+
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
|
9584
|
+
new_type == GGML_TYPE_IQ3_XXS) {
|
8593
9585
|
int nx = tensor->ne[0];
|
8594
9586
|
int ny = tensor->ne[1];
|
8595
9587
|
if (nx % QK_K != 0) {
|
@@ -8603,6 +9595,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8603
9595
|
switch (new_type) {
|
8604
9596
|
case GGML_TYPE_IQ2_XXS:
|
8605
9597
|
case GGML_TYPE_IQ2_XS:
|
9598
|
+
case GGML_TYPE_IQ3_XXS:
|
8606
9599
|
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
|
8607
9600
|
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
|
8608
9601
|
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
@@ -8631,8 +9624,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
8631
9624
|
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
|
8632
9625
|
|
8633
9626
|
// K-quants
|
9627
|
+
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
8634
9628
|
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
8635
|
-
case
|
9629
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
|
8636
9630
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
8637
9631
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
8638
9632
|
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
@@ -8643,6 +9637,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
8643
9637
|
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
8644
9638
|
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
|
8645
9639
|
case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
|
9640
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
|
8646
9641
|
|
8647
9642
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
8648
9643
|
}
|
@@ -8700,12 +9695,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
8700
9695
|
++qs.n_attention_wv;
|
8701
9696
|
}
|
8702
9697
|
else if (name.find("ffn_down") != std::string::npos) {
|
8703
|
-
++qs.n_feed_forward_w2;
|
9698
|
+
++qs.n_ffn_down;
|
9699
|
+
}
|
9700
|
+
else if (name.find("ffn_gate") != std::string::npos) {
|
9701
|
+
++qs.n_ffn_gate;
|
9702
|
+
}
|
9703
|
+
else if (name.find("ffn_up") != std::string::npos) {
|
9704
|
+
++qs.n_ffn_up;
|
8704
9705
|
}
|
8705
9706
|
}
|
8706
|
-
if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
|
8707
|
-
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
|
8708
|
-
__func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
|
9707
|
+
if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
|
9708
|
+
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
|
9709
|
+
__func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
|
8709
9710
|
}
|
8710
9711
|
|
8711
9712
|
size_t total_size_org = 0;
|
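The quantizer now tracks ffn_gate and ffn_up tensors separately from ffn_down, using the same substring match on tensor names it already used for attn_v, and warns when the counts disagree with n_layer. A toy version of that bookkeeping (tensor names invented for the example):

#include <cstdio>
#include <initializer_list>
#include <map>
#include <string>
#include <vector>

int main() {
    // Invented tensor names, shaped like the GGUF names the quantizer matches on.
    const std::vector<std::string> names = {
        "blk.0.attn_v.weight", "blk.0.ffn_down.weight", "blk.0.ffn_gate.weight", "blk.0.ffn_up.weight",
        "blk.1.attn_v.weight", "blk.1.ffn_down.weight", "blk.1.ffn_gate.weight", "blk.1.ffn_up.weight",
    };

    std::map<std::string, int> counts;
    for (const auto & name : names) {
        for (const char * key : { "attn_v", "ffn_down", "ffn_gate", "ffn_up" }) {
            if (name.find(key) != std::string::npos) {
                ++counts[key];
            }
        }
    }

    // As in the warning above, counts that disagree with n_layer would flag an unusual layout.
    for (const auto & kv : counts) {
        std::printf("%-8s : %d\n", kv.first.c_str(), kv.second);
    }
    return 0;
}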
@@ -8738,8 +9739,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);

-    std::set<ggml_type> used_iq2;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * tensor = ml.get_tensor_meta(i);

@@ -8792,11 +9791,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             const size_t nelements = ggml_nelements(tensor);

-            if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
-                ggml_init_iq2_quantization(new_type);
-                used_iq2.insert(new_type);
-            }
-
             const float * imatrix = nullptr;
             if (imatrix_data) {
                 auto it = imatrix_data->find(tensor->name);
@@ -8922,10 +9916,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     fout.close();

-    for (auto type : used_iq2) {
-        ggml_deinit_iq2_quantization(type);
-    }
-
     gguf_free(ctx_out);

     LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9271,6 +10261,8 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_beta_fast              =*/ 32.0f,
         /*.yarn_beta_slow              =*/ 1.0f,
         /*.yarn_orig_ctx               =*/ 0,
+        /*.cb_eval                     =*/ nullptr,
+        /*.cb_eval_user_data           =*/ nullptr,
         /*.type_k                      =*/ GGML_TYPE_F16,
         /*.type_v                      =*/ GGML_TYPE_F16,
         /*.mul_mat_q                   =*/ true,
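The new cb_eval / cb_eval_user_data fields let a caller observe tensors while the backend scheduler evaluates the graph (this is the hook the imatrix tooling uses). A minimal sketch, assuming the ggml_backend_sched_eval_callback signature from ggml-backend.h; the model path and the filtering rule are placeholders:

#include "llama.h"
#include "ggml.h"
#include <cstdio>

// Called twice per node: first with ask == true ("do you want to observe this tensor?"),
// then, if we returned true, with ask == false once the tensor data is available.
static bool observe_tensor(struct ggml_tensor * t, bool ask, void * user_data) {
    int * n_seen = static_cast<int *>(user_data);
    if (ask) {
        return t->op == GGML_OP_MUL_MAT; // only observe matrix multiplications in this sketch
    }
    ++*n_seen;
    std::printf("saw %s: %s [%lld x %lld]\n", ggml_op_name(t->op), t->name,
                (long long) t->ne[0], (long long) t->ne[1]);
    return true; // returning false cancels the graph compute
}

int main() {
    llama_backend_init(false);

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (!model) return 1;

    int n_seen = 0;
    llama_context_params cparams = llama_context_default_params();
    cparams.cb_eval           = observe_tensor;
    cparams.cb_eval_user_data = &n_seen;

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    // ... run llama_decode() as usual; observe_tensor fires for the scheduled nodes ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}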
@@ -9296,18 +10288,45 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }

-
-
+size_t llama_max_devices(void) {
+#if defined(GGML_USE_METAL)
+    return 1;
+#elif defined(GGML_USE_CUBLAS)
+    return GGML_CUDA_MAX_DEVICES;
+#elif defined(GGML_USE_SYCL)
+    return GGML_SYCL_MAX_DEVICES;
+#else
+    return 1;
+#endif
 }

-bool llama_mmap_supported(void) {
+bool llama_supports_mmap(void) {
     return llama_mmap::SUPPORTED;
 }

-bool llama_mlock_supported(void) {
+bool llama_supports_mlock(void) {
     return llama_mlock::SUPPORTED;
 }

+bool llama_supports_gpu_offload(void) {
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+    return true;
+#else
+    return false;
+#endif
+}
+
+// deprecated:
+bool llama_mmap_supported(void) {
+    return llama_supports_mmap();
+}
+
+bool llama_mlock_supported(void) {
+    return llama_supports_mlock();
+}
+
 void llama_backend_init(bool numa) {
     ggml_time_init();

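With llama_max_devices() returning size_t and llama_supports_mmap()/llama_supports_mlock()/llama_supports_gpu_offload() replacing the now-deprecated wrappers, a caller can probe the build before configuring offload. A small sketch using only the functions shown above:

#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init(false);

    std::printf("max devices     : %zu\n", llama_max_devices());
    std::printf("mmap supported  : %s\n", llama_supports_mmap()        ? "yes" : "no");
    std::printf("mlock supported : %s\n", llama_supports_mlock()       ? "yes" : "no");
    std::printf("GPU offload     : %s\n", llama_supports_gpu_offload() ? "yes" : "no");

    // Only ask for GPU layers when the build actually has an offload backend.
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = llama_supports_gpu_offload() ? 99 : 0;

    llama_backend_free();
    return 0;
}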
@@ -9331,6 +10350,7 @@ void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_free();
 #endif
+    ggml_quantize_free();
 }

 int64_t llama_time_us(void) {
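llama_backend_free() now also calls ggml_quantize_free(), which is what made the per-tensor used_iq2 init/deinit bookkeeping removed earlier in this diff unnecessary. The expected lifecycle stays the usual init/free pair:

#include "llama.h"

int main() {
    // Per-process setup (numa = false here); pairs with llama_backend_free() below.
    llama_backend_init(false);

    // ... load models, create contexts, quantize, decode ...

    // Tears down the backend and, as of this version, also calls ggml_quantize_free()
    // to release any lazily initialized quantization tables (e.g. the IQ2/IQ3 grids).
    llama_backend_free();
    return 0;
}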
@@ -9338,8 +10358,8 @@ int64_t llama_time_us(void) {
 }

 struct llama_model * llama_load_model_from_file(
-
-
+        const char * path_model,
+        struct llama_model_params params) {
     ggml_time_init();

     llama_model * model = new llama_model;
@@ -9411,6 +10431,9 @@ struct llama_context * llama_new_context_with_model(
                            hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
                                                           hparams.n_ctx_train;

+    cparams.cb_eval           = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
+
     auto rope_scaling_type = params.rope_scaling_type;
     if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
         rope_scaling_type = hparams.rope_scaling_type_train;
@@ -9477,6 +10500,36 @@ struct llama_context * llama_new_context_with_model(
             }
         }
     }
+#elif defined(GGML_USE_VULKAN)
+    if (model->n_gpu_layers > 0) {
+        ggml_backend_t backend = ggml_backend_vk_init();
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
+#elif defined(GGML_USE_SYCL)
+    if (model->n_gpu_layers > 0) {
+        ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
+#elif defined(GGML_USE_KOMPUTE)
+    if (model->n_gpu_layers > 0) {
+        auto * backend = ggml_backend_kompute_init(model->main_gpu);
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
 #endif
     ctx->backend_cpu = ggml_backend_cpu_init();
     if (ctx->backend_cpu == nullptr) {
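The Vulkan, SYCL and Kompute backends are chosen at compile time; at run time the only knobs exposed through the C API are n_gpu_layers and main_gpu on llama_model_params, as the #elif chain above shows. A minimal sketch that works with whichever GPU backend the library was built with (the model path is a placeholder):

#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init(false);

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 33; // > 0 triggers CUDA/Metal/Vulkan/SYCL/Kompute init, per the chain above
    mparams.main_gpu     = 0;  // device index passed to the SYCL/Kompute backends

    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (model == nullptr) {
        std::fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {
        // e.g. the Vulkan/SYCL/Kompute backend failed to initialize
        llama_free_model(model);
        llama_backend_free();
        return 1;
    }

    // ... llama_decode() as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}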
@@ -9518,6 +10571,35 @@ struct llama_context * llama_new_context_with_model(
         ctx->embedding.resize(hparams.n_embd);
     }

+    // graph inputs
+    {
+        ggml_init_params init_params = {
+            /* .mem_size   */ ggml_tensor_overhead()*5,
+            /* .mem_buffer */ nullptr,
+            /* .no_alloc   */ true,
+        };
+        ctx->ctx_input = ggml_init(init_params);
+
+        ctx->inp_tokens  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+        ctx->inp_embd    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
+        ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+        ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
+        ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
+
+        ggml_set_name(ctx->inp_tokens,  "inp_tokens");
+        ggml_set_name(ctx->inp_embd,    "inp_embd");
+        ggml_set_name(ctx->inp_pos,     "inp_pos");
+        ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
+        ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
+
+        ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
+
+        LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
+                ggml_backend_buffer_name(ctx->buf_input),
+                ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
+    }
+
+    // scheduler and compute buffers
     {
         // buffer types used for the compute buffer of each backend
         std::vector<ggml_backend_buffer_type_t> backend_buft;
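The context now pre-creates its input tensors (inp_tokens, inp_pos, inp_KQ_mask, ...) in a dedicated no_alloc ggml context and places them all in one backend buffer instead of allocating them inside every compute graph. A standalone sketch of that allocation pattern using the ggml-backend/ggml-alloc API shown above (tensor names and sizes are arbitrary here):

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <cstdio>

int main() {
    const int n_batch = 512;

    // no_alloc context: only tensor metadata lives here, no data buffers
    ggml_init_params init_params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 2,
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ true,
    };
    ggml_context * ctx = ggml_init(init_params);

    ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_batch);
    ggml_tensor * inp_pos    = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_batch);
    ggml_set_name(inp_tokens, "inp_tokens");
    ggml_set_name(inp_pos,    "inp_pos");

    // allocate every tensor of the context in a single buffer of the chosen backend type
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());

    std::printf("%s: %.2f KiB\n", ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0);

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    return 0;
}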
@@ -9544,9 +10626,6 @@ struct llama_context * llama_new_context_with_model(

         // initialize scheduler with the worst-case graph
         ggml_backend_sched_init_measure(ctx->sched, gf);
-        // note: the number of splits during measure is higher than during inference due to the kv shift
-        int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-        LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
         ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);

         for (ggml_backend_t backend : ctx->backends) {
@@ -9555,6 +10634,10 @@ struct llama_context * llama_new_context_with_model(
                     ggml_backend_buffer_name(buf),
                     ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
         }
+
+        // note: the number of splits during measure is higher than during inference due to the kv shift
+        int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
+        LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
     }
 }

@@ -10294,22 +11377,24 @@ struct llama_batch llama_batch_get_one(
     };
 }

-struct llama_batch llama_batch_init(int32_t
+struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
     llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

     if (embd) {
-        batch.embd = (float *) malloc(sizeof(float) *
+        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
     } else {
-        batch.token = (llama_token *) malloc(sizeof(llama_token) *
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
     }

-    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      *
-    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        *
-    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) *
-    for (int i = 0; i <
+    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens_alloc);
+    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens_alloc);
+    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
+    for (int i = 0; i < n_tokens_alloc; ++i) {
         batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
     }
-    batch.
+    batch.seq_id[n_tokens_alloc] = nullptr;
+
+    batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);

     return batch;
 }
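llama_batch_init() now takes an explicit n_tokens_alloc, allocates the logits array, and null-terminates seq_id so that llama_batch_free() can walk it without knowing the original size (see the next hunk). Typical usage of the pair; the prompt tokens are invented for the sketch:

#include "llama.h"
#include <vector>

int main() {
    // Pretend these came from llama_tokenize(); the values are placeholders.
    const std::vector<llama_token> prompt = { 1, 15043, 3186 };

    // room for 512 tokens, token ids (embd == 0), at most 1 sequence id per token
    llama_batch batch = llama_batch_init(512, 0, 1);

    batch.n_tokens = (int32_t) prompt.size();
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        batch.token[i]     = prompt[i];
        batch.pos[i]       = i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;                         // sequence 0
        batch.logits[i]    = (i == batch.n_tokens - 1); // only need logits for the last token
    }

    // ... llama_decode(ctx, batch) ...

    // frees token/pos/n_seq_id/logits and every seq_id[i] up to the nullptr sentinel
    llama_batch_free(batch);
    return 0;
}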
@@ -10320,7 +11405,7 @@ void llama_batch_free(struct llama_batch batch) {
     if (batch.pos)      free(batch.pos);
     if (batch.n_seq_id) free(batch.n_seq_id);
     if (batch.seq_id) {
-        for (int i = 0; i
+        for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
             free(batch.seq_id[i]);
         }
         free(batch.seq_id);