llama_cpp 0.12.3 → 0.12.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +22 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -2
- data/vendor/tmp/llama.cpp/Makefile +160 -56
- data/vendor/tmp/llama.cpp/ggml-alloc.c +85 -25
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +115 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +688 -270
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +121 -86
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +745 -109
- data/vendor/tmp/llama.cpp/ggml-quants.h +81 -56
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15296 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +51714 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5726 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +39 -0
- data/vendor/tmp/llama.cpp/ggml.c +356 -60
- data/vendor/tmp/llama.cpp/ggml.h +7 -1
- data/vendor/tmp/llama.cpp/llama.cpp +876 -118
- data/vendor/tmp/llama.cpp/llama.h +12 -16
- metadata +9 -2
@@ -11,6 +11,12 @@
|
|
11
11
|
# include "ggml-cuda.h"
|
12
12
|
#elif defined(GGML_USE_CLBLAST)
|
13
13
|
# include "ggml-opencl.h"
|
14
|
+
#elif defined(GGML_USE_VULKAN)
|
15
|
+
# include "ggml-vulkan.h"
|
16
|
+
#elif defined(GGML_USE_SYCL)
|
17
|
+
# include "ggml-sycl.h"
|
18
|
+
#elif defined(GGML_USE_KOMPUTE)
|
19
|
+
# include "ggml-kompute.h"
|
14
20
|
#endif
|
15
21
|
|
16
22
|
#ifdef GGML_USE_METAL
|
@@ -52,6 +58,7 @@
|
|
52
58
|
#include <algorithm>
|
53
59
|
#include <array>
|
54
60
|
#include <cassert>
|
61
|
+
#include <cfloat>
|
55
62
|
#include <cinttypes>
|
56
63
|
#include <climits>
|
57
64
|
#include <cmath>
|
@@ -196,10 +203,13 @@ enum llm_arch {
|
|
196
203
|
LLM_ARCH_PHI2,
|
197
204
|
LLM_ARCH_PLAMO,
|
198
205
|
LLM_ARCH_CODESHELL,
|
206
|
+
LLM_ARCH_ORION,
|
207
|
+
LLM_ARCH_INTERNLM2,
|
208
|
+
LLM_ARCH_MINICPM,
|
199
209
|
LLM_ARCH_UNKNOWN,
|
200
210
|
};
|
201
211
|
|
202
|
-
static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
212
|
+
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
203
213
|
{ LLM_ARCH_LLAMA, "llama" },
|
204
214
|
{ LLM_ARCH_FALCON, "falcon" },
|
205
215
|
{ LLM_ARCH_GPT2, "gpt2" },
|
@@ -217,6 +227,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
|
217
227
|
{ LLM_ARCH_PHI2, "phi2" },
|
218
228
|
{ LLM_ARCH_PLAMO, "plamo" },
|
219
229
|
{ LLM_ARCH_CODESHELL, "codeshell" },
|
230
|
+
{ LLM_ARCH_ORION, "orion" },
|
231
|
+
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
232
|
+
{ LLM_ARCH_MINICPM, "minicpm" },
|
220
233
|
};
|
221
234
|
|
222
235
|
enum llm_kv {
|
@@ -269,11 +282,12 @@ enum llm_kv {
|
|
269
282
|
LLM_KV_TOKENIZER_PAD_ID,
|
270
283
|
LLM_KV_TOKENIZER_ADD_BOS,
|
271
284
|
LLM_KV_TOKENIZER_ADD_EOS,
|
285
|
+
LLM_KV_TOKENIZER_ADD_PREFIX,
|
272
286
|
LLM_KV_TOKENIZER_HF_JSON,
|
273
287
|
LLM_KV_TOKENIZER_RWKV,
|
274
288
|
};
|
275
289
|
|
276
|
-
static std::map<llm_kv, std::string> LLM_KV_NAMES = {
|
290
|
+
static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
277
291
|
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
278
292
|
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
279
293
|
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
@@ -323,6 +337,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
|
|
323
337
|
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
324
338
|
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
325
339
|
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
340
|
+
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
326
341
|
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
327
342
|
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
328
343
|
};
|
@@ -333,7 +348,7 @@ struct LLM_KV {
|
|
333
348
|
llm_arch arch;
|
334
349
|
|
335
350
|
std::string operator()(llm_kv kv) const {
|
336
|
-
return ::format(LLM_KV_NAMES[kv]
|
351
|
+
return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
|
337
352
|
}
|
338
353
|
};
|
339
354
|
|
@@ -641,7 +656,65 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|
641
656
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
642
657
|
},
|
643
658
|
},
|
644
|
-
|
659
|
+
{
|
660
|
+
LLM_ARCH_ORION,
|
661
|
+
{
|
662
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
663
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
664
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
665
|
+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
666
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
667
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
668
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
669
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
670
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
671
|
+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
672
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
673
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
674
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
675
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
676
|
+
},
|
677
|
+
},
|
678
|
+
{
|
679
|
+
LLM_ARCH_INTERNLM2,
|
680
|
+
{
|
681
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
682
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
683
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
684
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
685
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
686
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
687
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
688
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
689
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
690
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
691
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
692
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
693
|
+
},
|
694
|
+
},
|
695
|
+
{
|
696
|
+
LLM_ARCH_MINICPM,
|
697
|
+
{
|
698
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
699
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
700
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
701
|
+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
702
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
703
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
704
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
705
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
706
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
707
|
+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
708
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
709
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
710
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
711
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
712
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
713
|
+
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
714
|
+
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
715
|
+
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
716
|
+
},
|
717
|
+
},
|
645
718
|
{
|
646
719
|
LLM_ARCH_UNKNOWN,
|
647
720
|
{
|
@@ -699,13 +772,13 @@ struct LLM_TN {
|
|
699
772
|
// gguf helpers
|
700
773
|
//
|
701
774
|
|
702
|
-
static std::map<
|
775
|
+
static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
703
776
|
{ LLAMA_ROPE_SCALING_NONE, "none" },
|
704
777
|
{ LLAMA_ROPE_SCALING_LINEAR, "linear" },
|
705
778
|
{ LLAMA_ROPE_SCALING_YARN, "yarn" },
|
706
779
|
};
|
707
780
|
|
708
|
-
static
|
781
|
+
static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
|
709
782
|
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
|
710
783
|
if (kv.second == name) {
|
711
784
|
return kv.first;
|
@@ -1132,10 +1205,10 @@ struct llama_mlock {
|
|
1132
1205
|
#ifdef __APPLE__
|
1133
1206
|
#define MLOCK_SUGGESTION \
|
1134
1207
|
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
1135
|
-
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing
|
1208
|
+
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
|
1136
1209
|
#else
|
1137
1210
|
#define MLOCK_SUGGESTION \
|
1138
|
-
"Try increasing
|
1211
|
+
"Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
|
1139
1212
|
#endif
|
1140
1213
|
|
1141
1214
|
bool raw_lock(const void * addr, size_t size) const {
|
@@ -1256,8 +1329,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
|
1256
1329
|
if (host_buffer) {
|
1257
1330
|
buft = ggml_backend_cuda_host_buffer_type();
|
1258
1331
|
}
|
1332
|
+
#elif defined(GGML_USE_SYCL)
|
1333
|
+
buft = ggml_backend_sycl_host_buffer_type();
|
1259
1334
|
#elif defined(GGML_USE_CPU_HBM)
|
1260
1335
|
buft = ggml_backend_cpu_hbm_buffer_type();
|
1336
|
+
#elif defined(GGML_USE_VULKAN)
|
1337
|
+
if (host_buffer) {
|
1338
|
+
buft = ggml_backend_vk_host_buffer_type();
|
1339
|
+
}
|
1261
1340
|
#endif
|
1262
1341
|
|
1263
1342
|
if (buft == nullptr) {
|
@@ -1275,8 +1354,17 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
|
1275
1354
|
buft = ggml_backend_metal_buffer_type();
|
1276
1355
|
#elif defined(GGML_USE_CUBLAS)
|
1277
1356
|
buft = ggml_backend_cuda_buffer_type(gpu);
|
1357
|
+
#elif defined(GGML_USE_VULKAN)
|
1358
|
+
buft = ggml_backend_vk_buffer_type(gpu);
|
1359
|
+
#elif defined(GGML_USE_SYCL)
|
1360
|
+
buft = ggml_backend_sycl_buffer_type(gpu);
|
1278
1361
|
#elif defined(GGML_USE_CLBLAST)
|
1279
1362
|
buft = ggml_backend_opencl_buffer_type();
|
1363
|
+
#elif defined(GGML_USE_KOMPUTE)
|
1364
|
+
buft = ggml_backend_kompute_buffer_type(gpu);
|
1365
|
+
if (buft == nullptr) {
|
1366
|
+
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
|
1367
|
+
}
|
1280
1368
|
#endif
|
1281
1369
|
|
1282
1370
|
if (buft == nullptr) {
|
@@ -1304,6 +1392,33 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
|
|
1304
1392
|
GGML_UNUSED(tensor_split);
|
1305
1393
|
}
|
1306
1394
|
|
1395
|
+
static size_t llama_get_device_count() {
|
1396
|
+
#if defined(GGML_USE_CUBLAS)
|
1397
|
+
return ggml_backend_cuda_get_device_count();
|
1398
|
+
#elif defined(GGML_USE_VULKAN)
|
1399
|
+
return ggml_backend_vk_get_device_count();
|
1400
|
+
#else
|
1401
|
+
return 1;
|
1402
|
+
#endif
|
1403
|
+
}
|
1404
|
+
|
1405
|
+
static size_t llama_get_device_memory(int device) {
|
1406
|
+
#if defined(GGML_USE_CUBLAS)
|
1407
|
+
size_t total;
|
1408
|
+
size_t free;
|
1409
|
+
ggml_backend_cuda_get_device_memory(device, &total, &free);
|
1410
|
+
return free;
|
1411
|
+
#elif defined(GGML_USE_VULKAN)
|
1412
|
+
size_t total;
|
1413
|
+
size_t free;
|
1414
|
+
ggml_backend_vk_get_device_memory(device, &total, &free);
|
1415
|
+
return free;
|
1416
|
+
#else
|
1417
|
+
return 1;
|
1418
|
+
GGML_UNUSED(device);
|
1419
|
+
#endif
|
1420
|
+
}
|
1421
|
+
|
1307
1422
|
//
|
1308
1423
|
// globals
|
1309
1424
|
//
|
@@ -1327,12 +1442,15 @@ enum e_model {
|
|
1327
1442
|
MODEL_UNKNOWN,
|
1328
1443
|
MODEL_0_5B,
|
1329
1444
|
MODEL_1B,
|
1445
|
+
MODEL_2B,
|
1330
1446
|
MODEL_3B,
|
1331
1447
|
MODEL_4B,
|
1332
1448
|
MODEL_7B,
|
1333
1449
|
MODEL_8B,
|
1334
1450
|
MODEL_13B,
|
1451
|
+
MODEL_14B,
|
1335
1452
|
MODEL_15B,
|
1453
|
+
MODEL_20B,
|
1336
1454
|
MODEL_30B,
|
1337
1455
|
MODEL_34B,
|
1338
1456
|
MODEL_40B,
|
@@ -1350,6 +1468,7 @@ static const size_t GiB = 1024*MiB;
|
|
1350
1468
|
|
1351
1469
|
struct llama_hparams {
|
1352
1470
|
bool vocab_only;
|
1471
|
+
bool rope_finetuned;
|
1353
1472
|
uint32_t n_vocab;
|
1354
1473
|
uint32_t n_ctx_train; // context size the model was trained on
|
1355
1474
|
uint32_t n_embd;
|
@@ -1369,8 +1488,7 @@ struct llama_hparams {
|
|
1369
1488
|
float rope_freq_base_train;
|
1370
1489
|
float rope_freq_scale_train;
|
1371
1490
|
uint32_t n_yarn_orig_ctx;
|
1372
|
-
|
1373
|
-
bool rope_finetuned : 1;
|
1491
|
+
int32_t rope_scaling_type_train;
|
1374
1492
|
|
1375
1493
|
float f_clamp_kqv;
|
1376
1494
|
float f_max_alibi_bias;
|
@@ -1574,6 +1692,8 @@ struct llama_vocab {
|
|
1574
1692
|
id special_suffix_id = 32008;
|
1575
1693
|
id special_eot_id = 32010;
|
1576
1694
|
|
1695
|
+
bool add_space_prefix = true;
|
1696
|
+
|
1577
1697
|
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
|
1578
1698
|
GGML_ASSERT(token_left.find(' ') == std::string::npos);
|
1579
1699
|
GGML_ASSERT(token_left.find('\n') == std::string::npos);
|
@@ -1670,6 +1790,10 @@ struct llama_context {
|
|
1670
1790
|
ggml_backend_free(backend);
|
1671
1791
|
}
|
1672
1792
|
|
1793
|
+
#ifdef GGML_USE_VULKAN
|
1794
|
+
ggml_vk_free_cpu_assist();
|
1795
|
+
#endif
|
1796
|
+
|
1673
1797
|
ggml_backend_buffer_free(buf_input);
|
1674
1798
|
ggml_free(ctx_input);
|
1675
1799
|
}
|
@@ -2323,6 +2447,7 @@ struct llama_model_loader {
|
|
2323
2447
|
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
2324
2448
|
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
|
2325
2449
|
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
|
2450
|
+
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
2326
2451
|
default:
|
2327
2452
|
{
|
2328
2453
|
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
@@ -2633,7 +2758,7 @@ struct llama_model_loader {
|
|
2633
2758
|
// load LLaMA models
|
2634
2759
|
//
|
2635
2760
|
|
2636
|
-
static
|
2761
|
+
static const char * llama_model_arch_name(llm_arch arch) {
|
2637
2762
|
auto it = LLM_ARCH_NAMES.find(arch);
|
2638
2763
|
if (it == LLM_ARCH_NAMES.end()) {
|
2639
2764
|
return "unknown";
|
@@ -2668,9 +2793,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
2668
2793
|
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
|
2669
2794
|
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
|
2670
2795
|
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
|
2671
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "
|
2796
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
|
2672
2797
|
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
2673
2798
|
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
|
2799
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
|
2674
2800
|
|
2675
2801
|
default: return "unknown, may not work";
|
2676
2802
|
}
|
@@ -2679,11 +2805,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
2679
2805
|
static const char * llama_model_type_name(e_model type) {
|
2680
2806
|
switch (type) {
|
2681
2807
|
case MODEL_1B: return "1B";
|
2808
|
+
case MODEL_2B: return "2B";
|
2682
2809
|
case MODEL_3B: return "3B";
|
2683
2810
|
case MODEL_7B: return "7B";
|
2684
2811
|
case MODEL_8B: return "8B";
|
2685
2812
|
case MODEL_13B: return "13B";
|
2813
|
+
case MODEL_14B: return "14B";
|
2686
2814
|
case MODEL_15B: return "15B";
|
2815
|
+
case MODEL_20B: return "20B";
|
2687
2816
|
case MODEL_30B: return "30B";
|
2688
2817
|
case MODEL_34B: return "34B";
|
2689
2818
|
case MODEL_40B: return "40B";
|
@@ -2696,6 +2825,14 @@ static const char * llama_model_type_name(e_model type) {
|
|
2696
2825
|
default: return "?B";
|
2697
2826
|
}
|
2698
2827
|
}
|
2828
|
+
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
2829
|
+
switch (type) {
|
2830
|
+
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
2831
|
+
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
2832
|
+
default: return "unknown";
|
2833
|
+
}
|
2834
|
+
}
|
2835
|
+
|
2699
2836
|
|
2700
2837
|
static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
|
2701
2838
|
model.arch = ml.get_arch();
|
@@ -2808,6 +2945,15 @@ static void llm_load_hparams(
|
|
2808
2945
|
default: model.type = e_model::MODEL_UNKNOWN;
|
2809
2946
|
}
|
2810
2947
|
} break;
|
2948
|
+
case LLM_ARCH_MINICPM:
|
2949
|
+
{
|
2950
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
2951
|
+
|
2952
|
+
switch (hparams.n_layer) {
|
2953
|
+
case 40: model.type = e_model::MODEL_2B; break;
|
2954
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
2955
|
+
}
|
2956
|
+
} break;
|
2811
2957
|
case LLM_ARCH_FALCON:
|
2812
2958
|
{
|
2813
2959
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
@@ -2950,7 +3096,24 @@ static void llm_load_hparams(
|
|
2950
3096
|
default: model.type = e_model::MODEL_UNKNOWN;
|
2951
3097
|
}
|
2952
3098
|
} break;
|
3099
|
+
case LLM_ARCH_ORION:
|
3100
|
+
{
|
3101
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
2953
3102
|
|
3103
|
+
switch (hparams.n_layer) {
|
3104
|
+
case 40: model.type = e_model::MODEL_14B; break;
|
3105
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
3106
|
+
}
|
3107
|
+
} break;
|
3108
|
+
case LLM_ARCH_INTERNLM2:
|
3109
|
+
{
|
3110
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
3111
|
+
switch (hparams.n_layer) {
|
3112
|
+
case 32: model.type = e_model::MODEL_7B; break;
|
3113
|
+
case 48: model.type = e_model::MODEL_20B; break;
|
3114
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
3115
|
+
}
|
3116
|
+
} break;
|
2954
3117
|
default: (void)0;
|
2955
3118
|
}
|
2956
3119
|
|
@@ -3002,6 +3165,11 @@ static void llm_load_vocab(
|
|
3002
3165
|
vocab.special_unk_id = 0;
|
3003
3166
|
vocab.special_sep_id = -1;
|
3004
3167
|
vocab.special_pad_id = -1;
|
3168
|
+
|
3169
|
+
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
3170
|
+
if (add_space_prefix_keyidx != -1) {
|
3171
|
+
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
3172
|
+
} // The default value of add_space_prefix is true.
|
3005
3173
|
} else if (tokenizer_name == "gpt2") {
|
3006
3174
|
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
3007
3175
|
|
@@ -3209,12 +3377,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
3209
3377
|
const auto & hparams = model.hparams;
|
3210
3378
|
const auto & vocab = model.vocab;
|
3211
3379
|
|
3212
|
-
const
|
3380
|
+
const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
|
3213
3381
|
|
3214
3382
|
// hparams
|
3215
3383
|
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
|
3216
|
-
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch)
|
3217
|
-
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type
|
3384
|
+
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
|
3385
|
+
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
|
3218
3386
|
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
3219
3387
|
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
|
3220
3388
|
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
|
@@ -3235,7 +3403,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
3235
3403
|
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
3236
3404
|
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
3237
3405
|
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
3238
|
-
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type
|
3406
|
+
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
3239
3407
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
3240
3408
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
3241
3409
|
LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
|
@@ -3301,22 +3469,18 @@ static bool llm_load_tensors(
|
|
3301
3469
|
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
|
3302
3470
|
}
|
3303
3471
|
|
3304
|
-
#ifdef GGML_USE_CUBLAS
|
3305
3472
|
if (split_mode == LLAMA_SPLIT_LAYER) {
|
3306
3473
|
// calculate the split points
|
3307
|
-
int device_count =
|
3474
|
+
int device_count = llama_get_device_count();
|
3308
3475
|
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
3309
|
-
float splits
|
3476
|
+
std::vector<float> splits(device_count);
|
3310
3477
|
if (all_zero) {
|
3311
3478
|
// default split, by free memory
|
3312
3479
|
for (int i = 0; i < device_count; ++i) {
|
3313
|
-
|
3314
|
-
size_t free;
|
3315
|
-
ggml_backend_cuda_get_device_memory(i, &total, &free);
|
3316
|
-
splits[i] = free;
|
3480
|
+
splits[i] = llama_get_device_memory(i);
|
3317
3481
|
}
|
3318
3482
|
} else {
|
3319
|
-
std::copy(tensor_split, tensor_split + device_count, splits);
|
3483
|
+
std::copy(tensor_split, tensor_split + device_count, splits.begin());
|
3320
3484
|
}
|
3321
3485
|
|
3322
3486
|
// sum and normalize the splits to get the split points
|
@@ -3332,19 +3496,17 @@ static bool llm_load_tensors(
|
|
3332
3496
|
// assign the repeating layers to the devices according to the splits
|
3333
3497
|
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
|
3334
3498
|
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
3335
|
-
int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
|
3499
|
+
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
|
3336
3500
|
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
|
3337
3501
|
}
|
3338
3502
|
// assign the output layer
|
3339
3503
|
if (n_gpu_layers > n_layer) {
|
3340
|
-
int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
|
3504
|
+
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
|
3341
3505
|
model.buft_output = llama_default_buffer_type_offload(layer_gpu);
|
3342
3506
|
} else {
|
3343
3507
|
model.buft_output = llama_default_buffer_type_cpu(true);
|
3344
3508
|
}
|
3345
|
-
} else
|
3346
|
-
#endif
|
3347
|
-
{
|
3509
|
+
} else {
|
3348
3510
|
ggml_backend_buffer_type_t split_buft;
|
3349
3511
|
if (split_mode == LLAMA_SPLIT_ROW) {
|
3350
3512
|
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
|
@@ -3423,13 +3585,16 @@ static bool llm_load_tensors(
|
|
3423
3585
|
switch (model.arch) {
|
3424
3586
|
case LLM_ARCH_LLAMA:
|
3425
3587
|
case LLM_ARCH_REFACT:
|
3588
|
+
case LLM_ARCH_MINICPM:
|
3426
3589
|
{
|
3427
3590
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
3428
3591
|
|
3429
3592
|
// output
|
3430
3593
|
{
|
3431
3594
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
3432
|
-
model.
|
3595
|
+
if (model.arch != LLM_ARCH_MINICPM){
|
3596
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
3597
|
+
}
|
3433
3598
|
}
|
3434
3599
|
|
3435
3600
|
for (int i = 0; i < n_layer; ++i) {
|
@@ -3933,6 +4098,65 @@ static bool llm_load_tensors(
|
|
3933
4098
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
3934
4099
|
}
|
3935
4100
|
} break;
|
4101
|
+
case LLM_ARCH_ORION:
|
4102
|
+
{
|
4103
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4104
|
+
{
|
4105
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4106
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
4107
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
4108
|
+
}
|
4109
|
+
for (int i = 0; i < n_layer; ++i) {
|
4110
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4111
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4112
|
+
|
4113
|
+
auto & layer = model.layers[i];
|
4114
|
+
|
4115
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4116
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
4117
|
+
|
4118
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
4119
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
4120
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
4121
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4122
|
+
|
4123
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4124
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
4125
|
+
|
4126
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
4127
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
4128
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4129
|
+
}
|
4130
|
+
} break;
|
4131
|
+
case LLM_ARCH_INTERNLM2:
|
4132
|
+
{
|
4133
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4134
|
+
|
4135
|
+
// output
|
4136
|
+
{
|
4137
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4138
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
4139
|
+
}
|
4140
|
+
|
4141
|
+
for (int i = 0; i < n_layer; ++i) {
|
4142
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4143
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4144
|
+
|
4145
|
+
auto & layer = model.layers[i];
|
4146
|
+
|
4147
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4148
|
+
// layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
4149
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
4150
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
4151
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
4152
|
+
|
4153
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4154
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4155
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
4156
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
4157
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4158
|
+
}
|
4159
|
+
} break;
|
3936
4160
|
default:
|
3937
4161
|
throw std::runtime_error("unknown architecture");
|
3938
4162
|
}
|
@@ -3985,8 +4209,7 @@ static bool llm_load_tensors(
|
|
3985
4209
|
ctx_bufs.emplace_back(ctx, buf);
|
3986
4210
|
}
|
3987
4211
|
|
3988
|
-
|
3989
|
-
{
|
4212
|
+
if (llama_supports_gpu_offload()) {
|
3990
4213
|
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
3991
4214
|
|
3992
4215
|
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
@@ -3998,10 +4221,11 @@ static bool llm_load_tensors(
|
|
3998
4221
|
const int max_offloadable_layers = hparams.n_layer + 1;
|
3999
4222
|
|
4000
4223
|
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
4224
|
+
}
|
4001
4225
|
|
4002
|
-
|
4003
|
-
|
4004
|
-
|
4226
|
+
// print memory requirements
|
4227
|
+
for (ggml_backend_buffer_t buf : model.bufs) {
|
4228
|
+
LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
4005
4229
|
}
|
4006
4230
|
|
4007
4231
|
// populate tensors_by_name
|
@@ -4029,7 +4253,7 @@ static bool llm_load_tensors(
|
|
4029
4253
|
}
|
4030
4254
|
|
4031
4255
|
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
4032
|
-
static int llama_model_load(const std::string & fname, llama_model & model,
|
4256
|
+
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
|
4033
4257
|
try {
|
4034
4258
|
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
4035
4259
|
|
@@ -4050,6 +4274,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
|
|
4050
4274
|
return 0;
|
4051
4275
|
}
|
4052
4276
|
|
4277
|
+
#ifdef GGML_USE_KOMPUTE
|
4278
|
+
if (params.n_gpu_layers > 0 && (
|
4279
|
+
!(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
|
4280
|
+
|| !(
|
4281
|
+
model.ftype == LLAMA_FTYPE_ALL_F32 ||
|
4282
|
+
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
|
4283
|
+
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
4284
|
+
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
|
4285
|
+
)
|
4286
|
+
)) {
|
4287
|
+
// TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
|
4288
|
+
LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
|
4289
|
+
params.n_gpu_layers = 0;
|
4290
|
+
}
|
4291
|
+
#endif
|
4292
|
+
|
4053
4293
|
if (!llm_load_tensors(
|
4054
4294
|
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
|
4055
4295
|
params.progress_callback, params.progress_callback_user_data
|
@@ -6366,69 +6606,455 @@ struct llm_build_context {
|
|
6366
6606
|
|
6367
6607
|
return gf;
|
6368
6608
|
}
|
6369
|
-
};
|
6370
|
-
|
6371
|
-
static struct ggml_cgraph * llama_build_graph(
|
6372
|
-
llama_context & lctx,
|
6373
|
-
const llama_batch & batch) {
|
6374
|
-
const auto & model = lctx.model;
|
6375
6609
|
|
6376
|
-
|
6377
|
-
|
6378
|
-
|
6379
|
-
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
6380
|
-
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
6381
|
-
if (il >= 0) {
|
6382
|
-
ggml_format_name(cur, "%s-%d", name, il);
|
6383
|
-
} else {
|
6384
|
-
ggml_set_name(cur, name);
|
6385
|
-
}
|
6610
|
+
struct ggml_cgraph * build_orion() {
|
6611
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6386
6612
|
|
6387
|
-
|
6388
|
-
|
6389
|
-
|
6390
|
-
ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
|
6391
|
-
}
|
6392
|
-
}
|
6393
|
-
};
|
6613
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6614
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6615
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6394
6616
|
|
6395
|
-
|
6617
|
+
struct ggml_tensor * cur;
|
6618
|
+
struct ggml_tensor * inpL;
|
6396
6619
|
|
6397
|
-
|
6620
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
6621
|
+
cb(inpL, "inp_embd", -1);
|
6398
6622
|
|
6399
|
-
|
6400
|
-
|
6401
|
-
|
6623
|
+
// inp_pos - contains the positions
|
6624
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
6625
|
+
cb(inp_pos, "inp_pos", -1);
|
6402
6626
|
|
6403
|
-
|
6404
|
-
|
6405
|
-
|
6627
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6628
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6629
|
+
cb(KQ_mask, "KQ_mask", -1);
|
6406
6630
|
|
6407
|
-
|
6631
|
+
// shift the entire K-cache if needed
|
6632
|
+
if (do_rope_shift) {
|
6633
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
6408
6634
|
}
|
6409
6635
|
|
6410
|
-
|
6411
|
-
|
6412
|
-
const int64_t n_tokens = batch.n_tokens;
|
6636
|
+
for (int il = 0; il < n_layer; ++il) {
|
6637
|
+
struct ggml_tensor * inpSA = inpL;
|
6413
6638
|
|
6414
|
-
|
6415
|
-
|
6639
|
+
// norm
|
6640
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6641
|
+
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
6642
|
+
LLM_NORM, cb, il);
|
6643
|
+
cb(cur, "attn_norm", il);
|
6416
6644
|
|
6417
|
-
|
6418
|
-
|
6645
|
+
// self-attention
|
6646
|
+
{
|
6647
|
+
// compute Q and K and RoPE them
|
6648
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6649
|
+
cb(Qcur, "Qcur", il);
|
6650
|
+
// if (model.layers[il].bq) {
|
6651
|
+
// Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
6652
|
+
// cb(Qcur, "Qcur", il);
|
6653
|
+
// }
|
6419
6654
|
|
6420
|
-
|
6421
|
-
|
6655
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6656
|
+
cb(Kcur, "Kcur", il);
|
6657
|
+
// if (model.layers[il].bk) {
|
6658
|
+
// Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
6659
|
+
// cb(Kcur, "Kcur", il);
|
6660
|
+
// }
|
6422
6661
|
|
6423
|
-
|
6424
|
-
|
6425
|
-
|
6662
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6663
|
+
cb(Vcur, "Vcur", il);
|
6664
|
+
// if (model.layers[il].bv) {
|
6665
|
+
// Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
6666
|
+
// cb(Vcur, "Vcur", il);
|
6667
|
+
// }
|
6426
6668
|
|
6427
|
-
|
6428
|
-
|
6669
|
+
Qcur = ggml_rope_custom(
|
6670
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6671
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
6672
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6673
|
+
);
|
6674
|
+
cb(Qcur, "Qcur", il);
|
6429
6675
|
|
6430
|
-
|
6431
|
-
|
6676
|
+
Kcur = ggml_rope_custom(
|
6677
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6678
|
+
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
6679
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6680
|
+
);
|
6681
|
+
cb(Kcur, "Kcur", il);
|
6682
|
+
|
6683
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6684
|
+
model.layers[il].wo, NULL,
|
6685
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6686
|
+
cb(cur, "kqv_out", il);
|
6687
|
+
}
|
6688
|
+
|
6689
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6690
|
+
cb(ffn_inp, "ffn_inp", il);
|
6691
|
+
|
6692
|
+
// feed-forward network
|
6693
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6694
|
+
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
6695
|
+
LLM_NORM, cb, il);
|
6696
|
+
cb(cur, "ffn_norm", il);
|
6697
|
+
|
6698
|
+
cur = llm_build_ffn(ctx0, cur,
|
6699
|
+
model.layers[il].ffn_up, NULL,
|
6700
|
+
model.layers[il].ffn_gate, NULL,
|
6701
|
+
model.layers[il].ffn_down, NULL,
|
6702
|
+
NULL,
|
6703
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
6704
|
+
cb(cur, "ffn_out", il);
|
6705
|
+
|
6706
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6707
|
+
cb(cur, "l_out", il);
|
6708
|
+
|
6709
|
+
// input for next layer
|
6710
|
+
inpL = cur;
|
6711
|
+
}
|
6712
|
+
|
6713
|
+
cur = inpL;
|
6714
|
+
|
6715
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6716
|
+
model.output_norm, model.output_norm_b,
|
6717
|
+
LLM_NORM, cb, -1);
|
6718
|
+
cb(cur, "result_norm", -1);
|
6719
|
+
|
6720
|
+
// lm_head
|
6721
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
6722
|
+
cb(cur, "result_output", -1);
|
6723
|
+
|
6724
|
+
ggml_build_forward_expand(gf, cur);
|
6725
|
+
|
6726
|
+
return gf;
|
6727
|
+
}
|
6728
|
+
|
6729
|
+
struct ggml_cgraph * build_internlm2() {
|
6730
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6731
|
+
|
6732
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6733
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6734
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6735
|
+
|
6736
|
+
struct ggml_tensor * cur;
|
6737
|
+
struct ggml_tensor * inpL;
|
6738
|
+
|
6739
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
6740
|
+
cb(inpL, "inp_embd", -1);
|
6741
|
+
|
6742
|
+
// inp_pos - contains the positions
|
6743
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
6744
|
+
cb(inp_pos, "inp_pos", -1);
|
6745
|
+
|
6746
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6747
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6748
|
+
cb(KQ_mask, "KQ_mask", -1);
|
6749
|
+
|
6750
|
+
// shift the entire K-cache if needed
|
6751
|
+
if (do_rope_shift) {
|
6752
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
6753
|
+
}
|
6754
|
+
|
6755
|
+
for (int il = 0; il < n_layer; ++il) {
|
6756
|
+
struct ggml_tensor * inpSA = inpL;
|
6757
|
+
|
6758
|
+
// norm
|
6759
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6760
|
+
model.layers[il].attn_norm, NULL,
|
6761
|
+
LLM_NORM_RMS, cb, il);
|
6762
|
+
cb(cur, "attn_norm", il);
|
6763
|
+
|
6764
|
+
// self-attention
|
6765
|
+
{
|
6766
|
+
// compute Q and K and RoPE them
|
6767
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6768
|
+
cb(Qcur, "Qcur", il);
|
6769
|
+
if (model.layers[il].bq) {
|
6770
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
6771
|
+
cb(Qcur, "Qcur", il);
|
6772
|
+
}
|
6773
|
+
|
6774
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6775
|
+
cb(Kcur, "Kcur", il);
|
6776
|
+
if (model.layers[il].bk) {
|
6777
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
6778
|
+
cb(Kcur, "Kcur", il);
|
6779
|
+
}
|
6780
|
+
|
6781
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6782
|
+
cb(Vcur, "Vcur", il);
|
6783
|
+
if (model.layers[il].bv) {
|
6784
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
6785
|
+
cb(Vcur, "Vcur", il);
|
6786
|
+
}
|
6787
|
+
|
6788
|
+
Qcur = ggml_rope_custom(
|
6789
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6790
|
+
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
6791
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6792
|
+
);
|
6793
|
+
cb(Qcur, "Qcur", il);
|
6794
|
+
|
6795
|
+
Kcur = ggml_rope_custom(
|
6796
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6797
|
+
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
6798
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6799
|
+
);
|
6800
|
+
cb(Kcur, "Kcur", il);
|
6801
|
+
|
6802
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6803
|
+
model.layers[il].wo, model.layers[il].bo,
|
6804
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6805
|
+
cb(cur, "kqv_out", il);
|
6806
|
+
}
|
6807
|
+
|
6808
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6809
|
+
cb(ffn_inp, "ffn_inp", il);
|
6810
|
+
|
6811
|
+
// feed-forward network
|
6812
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6813
|
+
model.layers[il].ffn_norm, NULL,
|
6814
|
+
LLM_NORM_RMS, cb, il);
|
6815
|
+
cb(cur, "ffn_norm", il);
|
6816
|
+
|
6817
|
+
cur = llm_build_ffn(ctx0, cur,
|
6818
|
+
model.layers[il].ffn_up, NULL,
|
6819
|
+
model.layers[il].ffn_gate, NULL,
|
6820
|
+
model.layers[il].ffn_down, NULL,
|
6821
|
+
NULL,
|
6822
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
6823
|
+
cb(cur, "ffn_out", il);
|
6824
|
+
|
6825
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6826
|
+
cb(cur, "l_out", il);
|
6827
|
+
|
6828
|
+
// input for next layer
|
6829
|
+
inpL = cur;
|
6830
|
+
}
|
6831
|
+
|
6832
|
+
cur = inpL;
|
6833
|
+
|
6834
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6835
|
+
model.output_norm, NULL,
|
6836
|
+
LLM_NORM_RMS, cb, -1);
|
6837
|
+
cb(cur, "result_norm", -1);
|
6838
|
+
|
6839
|
+
// lm_head
|
6840
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
6841
|
+
cb(cur, "result_output", -1);
|
6842
|
+
|
6843
|
+
ggml_build_forward_expand(gf, cur);
|
6844
|
+
|
6845
|
+
return gf;
|
6846
|
+
}
|
6847
|
+
|
6848
|
+
// ref: https://arxiv.org/abs/2203.03466
|
6849
|
+
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
|
6850
|
+
// based on the original build_llama() function
|
6851
|
+
struct ggml_cgraph * build_minicpm() {
|
6852
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6853
|
+
|
6854
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
6855
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
6856
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6857
|
+
|
6858
|
+
const int64_t n_embd = hparams.n_embd;
|
6859
|
+
//TODO: if the model varies, these parameters need to be read from the model
|
6860
|
+
const int64_t n_embd_base = 256;
|
6861
|
+
const float scale_embd = 12.0f;
|
6862
|
+
const float scale_depth = 1.4f;
|
6863
|
+
|
6864
|
+
struct ggml_tensor * cur;
|
6865
|
+
struct ggml_tensor * inpL;
|
6866
|
+
|
6867
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
6868
|
+
cb(inpL, "inp_embd", -1);
|
6869
|
+
|
6870
|
+
// scale the input embeddings
|
6871
|
+
inpL = ggml_scale(ctx0, inpL, scale_embd);
|
6872
|
+
cb(inpL, "inp_scaled", -1);
|
6873
|
+
|
6874
|
+
// inp_pos - contains the positions
|
6875
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
6876
|
+
cb(inp_pos, "inp_pos", -1);
|
6877
|
+
|
6878
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6879
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6880
|
+
cb(KQ_mask, "KQ_mask", -1);
|
6881
|
+
|
6882
|
+
// shift the entire K-cache if needed
|
6883
|
+
if (do_rope_shift) {
|
6884
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
6885
|
+
}
|
6886
|
+
|
6887
|
+
for (int il = 0; il < n_layer; ++il) {
|
6888
|
+
struct ggml_tensor * inpSA = inpL;
|
6889
|
+
|
6890
|
+
// norm
|
6891
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
6892
|
+
model.layers[il].attn_norm, NULL,
|
6893
|
+
LLM_NORM_RMS, cb, il);
|
6894
|
+
cb(cur, "attn_norm", il);
|
6895
|
+
|
6896
|
+
// self-attention
|
6897
|
+
{
|
6898
|
+
// compute Q and K and RoPE them
|
6899
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
6900
|
+
cb(Qcur, "Qcur", il);
|
6901
|
+
if (model.layers[il].bq) {
|
6902
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
6903
|
+
cb(Qcur, "Qcur", il);
|
6904
|
+
}
|
6905
|
+
|
6906
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
6907
|
+
cb(Kcur, "Kcur", il);
|
6908
|
+
if (model.layers[il].bk) {
|
6909
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
6910
|
+
cb(Kcur, "Kcur", il);
|
6911
|
+
}
|
6912
|
+
|
6913
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
6914
|
+
cb(Vcur, "Vcur", il);
|
6915
|
+
if (model.layers[il].bv) {
|
6916
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
6917
|
+
cb(Vcur, "Vcur", il);
|
6918
|
+
}
|
6919
|
+
|
6920
|
+
Qcur = ggml_rope_custom(
|
6921
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6922
|
+
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
6923
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6924
|
+
);
|
6925
|
+
cb(Qcur, "Qcur", il);
|
6926
|
+
|
6927
|
+
Kcur = ggml_rope_custom(
|
6928
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6929
|
+
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
6930
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
6931
|
+
);
|
6932
|
+
cb(Kcur, "Kcur", il);
|
6933
|
+
|
6934
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6935
|
+
model.layers[il].wo, model.layers[il].bo,
|
6936
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6937
|
+
cb(cur, "kqv_out", il);
|
6938
|
+
}
|
6939
|
+
|
6940
|
+
// scale_res - scale the hidden states for residual connection
|
6941
|
+
const float scale_res = scale_depth/sqrtf(float(n_layer));
|
6942
|
+
cur = ggml_scale(ctx0, cur, scale_res);
|
6943
|
+
cb(cur, "hidden_scaled", -1);
|
6944
|
+
|
6945
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
6946
|
+
cb(ffn_inp, "ffn_inp", il);
|
6947
|
+
|
6948
|
+
// feed-forward network
|
6949
|
+
{
|
6950
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
6951
|
+
model.layers[il].ffn_norm, NULL,
|
6952
|
+
LLM_NORM_RMS, cb, il);
|
6953
|
+
cb(cur, "ffn_norm", il);
|
6954
|
+
|
6955
|
+
cur = llm_build_ffn(ctx0, cur,
|
6956
|
+
model.layers[il].ffn_up, NULL,
|
6957
|
+
model.layers[il].ffn_gate, NULL,
|
6958
|
+
model.layers[il].ffn_down, NULL,
|
6959
|
+
NULL,
|
6960
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
6961
|
+
cb(cur, "ffn_out", il);
|
6962
|
+
}
|
6963
|
+
|
6964
|
+
// scale the hidden states for residual connection
|
6965
|
+
cur = ggml_scale(ctx0, cur, scale_res);
|
6966
|
+
cb(cur, "hidden_scaled_ffn", -1);
|
6967
|
+
|
6968
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
6969
|
+
cb(cur, "l_out", il);
|
6970
|
+
|
6971
|
+
// input for next layer
|
6972
|
+
inpL = cur;
|
6973
|
+
}
|
6974
|
+
|
6975
|
+
cur = inpL;
|
6976
|
+
|
6977
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
6978
|
+
model.output_norm, NULL,
|
6979
|
+
LLM_NORM_RMS, cb, -1);
|
6980
|
+
cb(cur, "result_norm", -1);
|
6981
|
+
|
6982
|
+
// lm_head scaling
|
6983
|
+
const float scale_lmhead = float(n_embd_base)/float(n_embd);
|
6984
|
+
cur = ggml_scale(ctx0, cur, scale_lmhead);
|
6985
|
+
cb(cur, "lmhead_scaling", -1);
|
6986
|
+
|
6987
|
+
// lm_head
|
6988
|
+
cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
|
6989
|
+
cb(cur, "result_output", -1);
|
6990
|
+
|
6991
|
+
ggml_build_forward_expand(gf, cur);
|
6992
|
+
|
6993
|
+
return gf;
|
6994
|
+
}
|
6995
|
+
};
|
6996
|
+
|
6997
|
+
static struct ggml_cgraph * llama_build_graph(
|
6998
|
+
llama_context & lctx,
|
6999
|
+
const llama_batch & batch) {
|
7000
|
+
const auto & model = lctx.model;
|
7001
|
+
|
7002
|
+
// check if we should build the worst-case graph (for memory measurement)
|
7003
|
+
const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
|
7004
|
+
|
7005
|
+
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
7006
|
+
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
7007
|
+
if (il >= 0) {
|
7008
|
+
ggml_format_name(cur, "%s-%d", name, il);
|
7009
|
+
} else {
|
7010
|
+
ggml_set_name(cur, name);
|
7011
|
+
}
|
7012
|
+
|
7013
|
+
if (!lctx.cparams.offload_kqv) {
|
7014
|
+
if (strcmp(name, "kqv_merged_cont") == 0) {
|
7015
|
+
// all nodes between the KV store and the attention output are run on the CPU
|
7016
|
+
ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
|
7017
|
+
}
|
7018
|
+
}
|
7019
|
+
};
|
7020
|
+
|
7021
|
+
struct ggml_cgraph * result = NULL;
|
7022
|
+
|
7023
|
+
struct llm_build_context llm(lctx, batch, cb, worst_case);
|
7024
|
+
|
7025
|
+
//
|
7026
|
+
// set input data
|
7027
|
+
//
|
7028
|
+
|
7029
|
+
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
7030
|
+
if (batch.token) {
|
7031
|
+
const int64_t n_tokens = batch.n_tokens;
|
7032
|
+
|
7033
|
+
ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
|
7034
|
+
}
|
7035
|
+
|
7036
|
+
if (batch.embd) {
|
7037
|
+
const int64_t n_embd = llm.n_embd;
|
7038
|
+
const int64_t n_tokens = batch.n_tokens;
|
7039
|
+
|
7040
|
+
ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
7041
|
+
}
|
7042
|
+
|
7043
|
+
if (batch.pos) {
|
7044
|
+
const int64_t n_tokens = batch.n_tokens;
|
7045
|
+
|
7046
|
+
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
7047
|
+
}
|
7048
|
+
|
7049
|
+
{
|
7050
|
+
const int64_t n_kv = llm.n_kv;
|
7051
|
+
const int64_t n_tokens = batch.n_tokens;
|
7052
|
+
|
7053
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
7054
|
+
float * data = (float *) lctx.inp_KQ_mask->data;
|
7055
|
+
|
7056
|
+
for (int h = 0; h < 1; ++h) {
|
7057
|
+
for (int j = 0; j < n_tokens; ++j) {
|
6432
7058
|
const llama_pos pos = batch.pos[j];
|
6433
7059
|
const llama_seq_id seq_id = batch.seq_id[j][0];
|
6434
7060
|
|
@@ -6520,6 +7146,18 @@ static struct ggml_cgraph * llama_build_graph(
|
|
6520
7146
|
{
|
6521
7147
|
result = llm.build_codeshell();
|
6522
7148
|
} break;
|
7149
|
+
case LLM_ARCH_ORION:
|
7150
|
+
{
|
7151
|
+
result = llm.build_orion();
|
7152
|
+
} break;
|
7153
|
+
case LLM_ARCH_INTERNLM2:
|
7154
|
+
{
|
7155
|
+
result = llm.build_internlm2();
|
7156
|
+
} break;
|
7157
|
+
case LLM_ARCH_MINICPM:
|
7158
|
+
{
|
7159
|
+
result = llm.build_minicpm();
|
7160
|
+
} break;
|
6523
7161
|
default:
|
6524
7162
|
GGML_ASSERT(false);
|
6525
7163
|
}
|
@@ -6651,11 +7289,6 @@ static int llama_decode_internal(
|
|
6651
7289
|
n_threads = std::min(4, n_threads);
|
6652
7290
|
}
|
6653
7291
|
|
6654
|
-
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
|
6655
|
-
if (ggml_cpu_has_cublas() && fully_offloaded) {
|
6656
|
-
n_threads = 1;
|
6657
|
-
}
|
6658
|
-
|
6659
7292
|
#ifdef GGML_USE_MPI
|
6660
7293
|
const int64_t n_layer = hparams.n_layer;
|
6661
7294
|
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
@@ -7467,7 +8100,9 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
7467
8100
|
//
|
7468
8101
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
7469
8102
|
if (&fragment == &fragment_buffer.front()) {
|
7470
|
-
|
8103
|
+
if (vocab.add_space_prefix) {
|
8104
|
+
raw_text = " " + raw_text; // prefix with space if the first token is not special
|
8105
|
+
}
|
7471
8106
|
}
|
7472
8107
|
|
7473
8108
|
#ifdef PRETOKENIZERDEBUG
|
@@ -7946,8 +8581,17 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
|
|
7946
8581
|
}
|
7947
8582
|
|
7948
8583
|
void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
|
8584
|
+
// TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
|
8585
|
+
// if (k >= (int32_t)candidates->size) {
|
8586
|
+
// return;
|
8587
|
+
// }
|
8588
|
+
|
7949
8589
|
const int64_t t_start_sample_us = ggml_time_us();
|
7950
8590
|
|
8591
|
+
if (k <= 0) {
|
8592
|
+
k = candidates->size;
|
8593
|
+
}
|
8594
|
+
|
7951
8595
|
k = std::max(k, (int) min_keep);
|
7952
8596
|
k = std::min(k, (int) candidates->size);
|
7953
8597
|
|
@@ -8054,21 +8698,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
|
|
8054
8698
|
return;
|
8055
8699
|
}
|
8056
8700
|
|
8057
|
-
llama_sample_softmax(ctx, candidates);
|
8058
|
-
|
8059
8701
|
const int64_t t_start_sample_us = ggml_time_us();
|
8060
8702
|
|
8061
|
-
|
8062
|
-
|
8703
|
+
bool min_p_applied = false;
|
8704
|
+
|
8705
|
+
// if the candidates aren't sorted, try the unsorted implementation first
|
8706
|
+
if (!candidates->sorted) {
|
8707
|
+
std::vector<llama_token_data> filtered_tokens;
|
8708
|
+
|
8709
|
+
float max_logit = -FLT_MAX;
|
8710
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
8711
|
+
max_logit = std::max(max_logit, candidates->data[i].logit);
|
8712
|
+
}
|
8713
|
+
const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
|
8714
|
+
|
8715
|
+
for (size_t i = 0; i < candidates->size; ++i) {
|
8716
|
+
if (candidates->data[i].logit >= min_logit) {
|
8717
|
+
filtered_tokens.push_back(candidates->data[i]);
|
8718
|
+
}
|
8719
|
+
}
|
8063
8720
|
|
8064
|
-
|
8065
|
-
if (
|
8066
|
-
|
8721
|
+
// if we have enough values the operation was a success
|
8722
|
+
if (filtered_tokens.size() >= min_keep) {
|
8723
|
+
memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
|
8724
|
+
candidates->size = filtered_tokens.size();
|
8725
|
+
min_p_applied = true;
|
8067
8726
|
}
|
8068
8727
|
}
|
8069
8728
|
|
8070
|
-
//
|
8071
|
-
|
8729
|
+
// if the candidates are sorted or the unsorted implementation failed, use this implementation
|
8730
|
+
if (!min_p_applied) {
|
8731
|
+
// Sort the logits in descending order
|
8732
|
+
if (!candidates->sorted) {
|
8733
|
+
std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
8734
|
+
return a.logit > b.logit;
|
8735
|
+
});
|
8736
|
+
candidates->sorted = true;
|
8737
|
+
}
|
8738
|
+
|
8739
|
+
const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
|
8740
|
+
size_t i = 1; // first token always matches
|
8741
|
+
|
8742
|
+
for (; i < candidates->size; ++i) {
|
8743
|
+
if (candidates->data[i].logit < min_logit && i >= min_keep) {
|
8744
|
+
break; // prob too small
|
8745
|
+
}
|
8746
|
+
}
|
8747
|
+
|
8748
|
+
// Resize the output vector to keep only the matching tokens
|
8749
|
+
candidates->size = i;
|
8750
|
+
}
|
8072
8751
|
|
8073
8752
|
if (ctx) {
|
8074
8753
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
@@ -8972,6 +9651,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8972
9651
|
else if (new_type != GGML_TYPE_Q8_0) {
|
8973
9652
|
new_type = GGML_TYPE_Q6_K;
|
8974
9653
|
}
|
9654
|
+
} else if (name == "token_embd.weight") {
|
9655
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
9656
|
+
new_type = GGML_TYPE_Q2_K;
|
9657
|
+
}
|
9658
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
9659
|
+
new_type = GGML_TYPE_Q4_K;
|
9660
|
+
}
|
8975
9661
|
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
8976
9662
|
if (name.find("attn_v.weight") != std::string::npos) {
|
8977
9663
|
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
@@ -8982,7 +9668,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8982
9668
|
if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
|
8983
9669
|
++qs.i_ffn_down;
|
8984
9670
|
}
|
8985
|
-
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
|
8986
9671
|
} else if (name.find("attn_v.weight") != std::string::npos) {
|
8987
9672
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
|
8988
9673
|
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
@@ -8990,6 +9675,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8990
9675
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
|
8991
9676
|
new_type = GGML_TYPE_Q4_K;
|
8992
9677
|
}
|
9678
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
9679
|
+
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
|
9680
|
+
}
|
8993
9681
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
8994
9682
|
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
8995
9683
|
}
|
@@ -9027,6 +9715,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9027
9715
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
|
9028
9716
|
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
9029
9717
|
}
|
9718
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
|
9719
|
+
new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
9720
|
+
}
|
9030
9721
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
9031
9722
|
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
|
9032
9723
|
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
|
@@ -9058,13 +9749,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9058
9749
|
} else if (name.find("attn_output.weight") != std::string::npos) {
|
9059
9750
|
if (arch != LLM_ARCH_FALCON) {
|
9060
9751
|
if (qs.model.hparams.n_expert == 8) {
|
9061
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
|
9752
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
9062
9753
|
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
|
9063
9754
|
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
9064
9755
|
new_type = GGML_TYPE_Q5_K;
|
9065
9756
|
}
|
9066
9757
|
} else {
|
9067
9758
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
9759
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
|
9068
9760
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
9069
9761
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
9070
9762
|
}
|
@@ -9107,7 +9799,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9107
9799
|
bool convert_incompatible_tensor = false;
|
9108
9800
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
9109
9801
|
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
9110
|
-
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS
|
9802
|
+
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
|
9803
|
+
new_type == GGML_TYPE_IQ3_XXS) {
|
9111
9804
|
int nx = tensor->ne[0];
|
9112
9805
|
int ny = tensor->ne[1];
|
9113
9806
|
if (nx % QK_K != 0) {
|
@@ -9121,6 +9814,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9121
9814
|
switch (new_type) {
|
9122
9815
|
case GGML_TYPE_IQ2_XXS:
|
9123
9816
|
case GGML_TYPE_IQ2_XS:
|
9817
|
+
case GGML_TYPE_IQ3_XXS:
|
9124
9818
|
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
|
9125
9819
|
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
|
9126
9820
|
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
@@ -9162,6 +9856,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9162
9856
|
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
9163
9857
|
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
|
9164
9858
|
case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
|
9859
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
|
9165
9860
|
|
9166
9861
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
9167
9862
|
}
|
@@ -9812,18 +10507,47 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
9812
10507
|
return result;
|
9813
10508
|
}
|
9814
10509
|
|
9815
|
-
|
9816
|
-
|
10510
|
+
size_t llama_max_devices(void) {
|
10511
|
+
#if defined(GGML_USE_METAL)
|
10512
|
+
return 1;
|
10513
|
+
#elif defined(GGML_USE_CUBLAS)
|
10514
|
+
return GGML_CUDA_MAX_DEVICES;
|
10515
|
+
#elif defined(GGML_USE_SYCL)
|
10516
|
+
return GGML_SYCL_MAX_DEVICES;
|
10517
|
+
#elif defined(GGML_USE_VULKAN)
|
10518
|
+
return GGML_VK_MAX_DEVICES;
|
10519
|
+
#else
|
10520
|
+
return 1;
|
10521
|
+
#endif
|
9817
10522
|
}
|
9818
10523
|
|
9819
|
-
bool
|
10524
|
+
bool llama_supports_mmap(void) {
|
9820
10525
|
return llama_mmap::SUPPORTED;
|
9821
10526
|
}
|
9822
10527
|
|
9823
|
-
bool
|
10528
|
+
bool llama_supports_mlock(void) {
|
9824
10529
|
return llama_mlock::SUPPORTED;
|
9825
10530
|
}
|
9826
10531
|
|
10532
|
+
bool llama_supports_gpu_offload(void) {
|
10533
|
+
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
10534
|
+
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
10535
|
+
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
10536
|
+
return true;
|
10537
|
+
#else
|
10538
|
+
return false;
|
10539
|
+
#endif
|
10540
|
+
}
|
10541
|
+
|
10542
|
+
// deprecated:
|
10543
|
+
bool llama_mmap_supported(void) {
|
10544
|
+
return llama_supports_mmap();
|
10545
|
+
}
|
10546
|
+
|
10547
|
+
bool llama_mlock_supported(void) {
|
10548
|
+
return llama_supports_mlock();
|
10549
|
+
}
|
10550
|
+
|
9827
10551
|
void llama_backend_init(bool numa) {
|
9828
10552
|
ggml_time_init();
|
9829
10553
|
|
@@ -9855,8 +10579,8 @@ int64_t llama_time_us(void) {
|
|
9855
10579
|
}
|
9856
10580
|
|
9857
10581
|
struct llama_model * llama_load_model_from_file(
|
9858
|
-
|
9859
|
-
|
10582
|
+
const char * path_model,
|
10583
|
+
struct llama_model_params params) {
|
9860
10584
|
ggml_time_init();
|
9861
10585
|
|
9862
10586
|
llama_model * model = new llama_model;
|
@@ -9997,6 +10721,38 @@ struct llama_context * llama_new_context_with_model(
|
|
9997
10721
|
}
|
9998
10722
|
}
|
9999
10723
|
}
|
10724
|
+
#elif defined(GGML_USE_VULKAN)
|
10725
|
+
if (model->n_gpu_layers > 0) {
|
10726
|
+
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
10727
|
+
ggml_backend_t backend = ggml_backend_vk_init(device);
|
10728
|
+
if (backend == nullptr) {
|
10729
|
+
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
|
10730
|
+
llama_free(ctx);
|
10731
|
+
return nullptr;
|
10732
|
+
}
|
10733
|
+
ctx->backends.push_back(backend);
|
10734
|
+
}
|
10735
|
+
}
|
10736
|
+
#elif defined(GGML_USE_SYCL)
|
10737
|
+
if (model->n_gpu_layers > 0) {
|
10738
|
+
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
|
10739
|
+
if (backend == nullptr) {
|
10740
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
|
10741
|
+
llama_free(ctx);
|
10742
|
+
return nullptr;
|
10743
|
+
}
|
10744
|
+
ctx->backends.push_back(backend);
|
10745
|
+
}
|
10746
|
+
#elif defined(GGML_USE_KOMPUTE)
|
10747
|
+
if (model->n_gpu_layers > 0) {
|
10748
|
+
auto * backend = ggml_backend_kompute_init(model->main_gpu);
|
10749
|
+
if (backend == nullptr) {
|
10750
|
+
LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
|
10751
|
+
llama_free(ctx);
|
10752
|
+
return nullptr;
|
10753
|
+
}
|
10754
|
+
ctx->backends.push_back(backend);
|
10755
|
+
}
|
10000
10756
|
#endif
|
10001
10757
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
10002
10758
|
if (ctx->backend_cpu == nullptr) {
|
@@ -10202,7 +10958,7 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
|
|
10202
10958
|
|
10203
10959
|
int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
10204
10960
|
return snprintf(buf, buf_size, "%s %s %s",
|
10205
|
-
llama_model_arch_name(model->arch)
|
10961
|
+
llama_model_arch_name(model->arch),
|
10206
10962
|
llama_model_type_name(model->type),
|
10207
10963
|
llama_model_ftype_name(model->ftype).c_str());
|
10208
10964
|
}
|
@@ -10844,22 +11600,24 @@ struct llama_batch llama_batch_get_one(
|
|
10844
11600
|
};
|
10845
11601
|
}
|
10846
11602
|
|
10847
|
-
struct llama_batch llama_batch_init(int32_t
|
11603
|
+
struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
|
10848
11604
|
llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
|
10849
11605
|
|
10850
11606
|
if (embd) {
|
10851
|
-
batch.embd = (float *) malloc(sizeof(float) *
|
11607
|
+
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
|
10852
11608
|
} else {
|
10853
|
-
batch.token = (llama_token *) malloc(sizeof(llama_token) *
|
11609
|
+
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
|
10854
11610
|
}
|
10855
11611
|
|
10856
|
-
batch.pos = (llama_pos *) malloc(sizeof(llama_pos) *
|
10857
|
-
batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) *
|
10858
|
-
batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) *
|
10859
|
-
for (int i = 0; i <
|
11612
|
+
batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
|
11613
|
+
batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
|
11614
|
+
batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
|
11615
|
+
for (int i = 0; i < n_tokens_alloc; ++i) {
|
10860
11616
|
batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
|
10861
11617
|
}
|
10862
|
-
batch.
|
11618
|
+
batch.seq_id[n_tokens_alloc] = nullptr;
|
11619
|
+
|
11620
|
+
batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);
|
10863
11621
|
|
10864
11622
|
return batch;
|
10865
11623
|
}
|
@@ -10870,7 +11628,7 @@ void llama_batch_free(struct llama_batch batch) {
|
|
10870
11628
|
if (batch.pos) free(batch.pos);
|
10871
11629
|
if (batch.n_seq_id) free(batch.n_seq_id);
|
10872
11630
|
if (batch.seq_id) {
|
10873
|
-
for (int i = 0; i
|
11631
|
+
for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
|
10874
11632
|
free(batch.seq_id[i]);
|
10875
11633
|
}
|
10876
11634
|
free(batch.seq_id);
|