llama_cpp 0.12.4 → 0.12.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +146 -53
- data/vendor/tmp/llama.cpp/ggml-alloc.c +563 -490
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +250 -262
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +688 -270
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +386 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +68 -59
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +139 -145
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1516 -10656
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1777 -1238
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +14 -9
- data/vendor/tmp/llama.cpp/ggml.c +147 -70
- data/vendor/tmp/llama.cpp/ggml.h +26 -6
- data/vendor/tmp/llama.cpp/llama.cpp +920 -173
- data/vendor/tmp/llama.cpp/llama.h +7 -1
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
data/vendor/tmp/llama.cpp/llama.cpp

@@ -196,6 +196,7 @@ enum llm_arch {
     LLM_ARCH_STARCODER,
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
+    LLM_ARCH_BERT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -205,10 +206,11 @@ enum llm_arch {
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
+    LLM_ARCH_MINICPM,
     LLM_ARCH_UNKNOWN,
 };
 
-static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
+static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA, "llama" },
     { LLM_ARCH_FALCON, "falcon" },
     { LLM_ARCH_GPT2, "gpt2" },
@@ -219,6 +221,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_STARCODER, "starcoder" },
     { LLM_ARCH_PERSIMMON, "persimmon" },
     { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BERT, "bert" },
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN, "qwen" },
@@ -228,6 +231,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
     { LLM_ARCH_INTERNLM2, "internlm2" },
+    { LLM_ARCH_MINICPM, "minicpm" },
 };
 
 enum llm_kv {
@@ -250,6 +254,7 @@ enum llm_kv {
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
+    LLM_KV_POOLING_LAYER,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -259,6 +264,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+    LLM_KV_ATTENTION_CAUSAL,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -271,6 +277,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
+    LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
     LLM_KV_TOKENIZER_SCORES,
     LLM_KV_TOKENIZER_MERGES,
     LLM_KV_TOKENIZER_BOS_ID,
@@ -285,7 +292,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_RWKV,
 };
 
-static std::map<llm_kv, std::string> LLM_KV_NAMES = {
+static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
     { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
     { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -305,6 +312,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
     { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+    { LLM_KV_POOLING_LAYER, "%s.pooling_layer" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -314,6 +322,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
     { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+    { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -326,6 +335,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
     { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
     { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
     { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
@@ -346,13 +356,14 @@ struct LLM_KV {
     llm_arch arch;
 
     std::string operator()(llm_kv kv) const {
-        return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
+        return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
     }
 };
 
 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
     LLM_TENSOR_TOKEN_EMBD_NORM,
+    LLM_TENSOR_TOKEN_TYPES,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
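As a side note (not part of the diff): the per-architecture gguf key names above are produced by substituting the architecture name into the "%s" format strings, which is what the updated LLM_KV::operator() does. A minimal standalone sketch, assuming the ::format helper behaves like snprintf-style formatting:

    #include <cstdio>
    #include <string>

    // Hypothetical stand-in for the ::format helper used by LLM_KV above.
    static std::string format_key(const char * fmt, const char * arch) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), fmt, arch);
        return std::string(buf);
    }

    int main() {
        // New entries from this diff: LLM_KV_POOLING_LAYER -> "%s.pooling_layer",
        // LLM_ARCH_BERT -> "bert", so the resulting gguf key is "bert.pooling_layer".
        std::printf("%s\n", format_key("%s.pooling_layer", "bert").c_str());
        return 0;
    }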
@@ -534,6 +545,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_POS_EMBD, "position_embd" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {
@@ -690,6 +718,29 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_MINICPM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -723,22 +774,37 @@ struct LLM_TN {
     llm_arch arch;
 
     std::string operator()(llm_tensor tensor) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return LLM_TENSOR_NAMES[arch].at(tensor);
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
     }
 
     std::string operator()(llm_tensor tensor, int bid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+            return "__missing__";
+        }
         return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
     }
 };
@@ -747,13 +813,13 @@ struct LLM_TN {
 // gguf helpers
 //
 
-static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
+static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
     { LLAMA_ROPE_SCALING_NONE, "none" },
     { LLAMA_ROPE_SCALING_LINEAR, "linear" },
     { LLAMA_ROPE_SCALING_YARN, "yarn" },
 };
 
-static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
+static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
     for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
         if (kv.second == name) {
             return kv.first;
@@ -1330,7 +1396,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 #elif defined(GGML_USE_CUBLAS)
     buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type();
+    buft = ggml_backend_vk_buffer_type(gpu);
 #elif defined(GGML_USE_SYCL)
     buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
@@ -1367,6 +1433,33 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
     GGML_UNUSED(tensor_split);
 }
 
+static size_t llama_get_device_count() {
+#if defined(GGML_USE_CUBLAS)
+    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    return ggml_backend_vk_get_device_count();
+#else
+    return 1;
+#endif
+}
+
+static size_t llama_get_device_memory(int device) {
+#if defined(GGML_USE_CUBLAS)
+    size_t total;
+    size_t free;
+    ggml_backend_cuda_get_device_memory(device, &total, &free);
+    return free;
+#elif defined(GGML_USE_VULKAN)
+    size_t total;
+    size_t free;
+    ggml_backend_vk_get_device_memory(device, &total, &free);
+    return free;
+#else
+    return 1;
+    GGML_UNUSED(device);
+#endif
+}
+
 //
 // globals
 //
@@ -1388,8 +1481,14 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_17M,
+    MODEL_22M,
+    MODEL_33M,
+    MODEL_109M,
+    MODEL_335M,
     MODEL_0_5B,
     MODEL_1B,
+    MODEL_2B,
     MODEL_3B,
     MODEL_4B,
     MODEL_7B,
@@ -1415,6 +1514,7 @@ static const size_t GiB = 1024*MiB;
 
 struct llama_hparams {
     bool vocab_only;
+    bool rope_finetuned;
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
@@ -1427,6 +1527,7 @@ struct llama_hparams {
     uint32_t n_ff;
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
+    uint32_t n_vocab_type = 0; // for BERT-style token types
 
     float f_norm_eps;
     float f_norm_rms_eps;
@@ -1434,12 +1535,14 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
-
-    bool rope_finetuned : 1;
+    int32_t rope_scaling_type_train;
 
     float f_clamp_kqv;
     float f_max_alibi_bias;
 
+    bool causal_attn = true;
+    bool pooling_layer = false;
+
 
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -1501,6 +1604,7 @@ struct llama_cparams {
 
     bool mul_mat_q;
     bool offload_kqv;
+    bool do_pooling;
 
     ggml_backend_sched_eval_callback cb_eval;
     void * cb_eval_user_data;
@@ -1667,6 +1771,7 @@ struct llama_model {
     llama_vocab vocab;
 
     struct ggml_tensor * tok_embd;
+    struct ggml_tensor * type_embd;
     struct ggml_tensor * pos_embd;
     struct ggml_tensor * tok_norm;
     struct ggml_tensor * tok_norm_b;
@@ -1737,6 +1842,10 @@ struct llama_context {
         ggml_backend_free(backend);
     }
 
+#ifdef GGML_USE_VULKAN
+    ggml_vk_free_cpu_assist();
+#endif
+
     ggml_backend_buffer_free(buf_input);
     ggml_free(ctx_input);
 }
@@ -1782,8 +1891,6 @@ struct llama_context {
     // memory buffers used to evaluate the model
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;
-    // allocator for the input tensors
-    ggml_tallocr * alloc = nullptr;
 
     // input tensors
     ggml_backend_buffer_t buf_input = nullptr;
@@ -1793,6 +1900,7 @@ struct llama_context {
     struct ggml_tensor * inp_pos; // I32 [n_batch]
     struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
     struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
+    struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch]
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -2701,7 +2809,7 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-static std::string llama_model_arch_name(llm_arch arch) {
+static const char * llama_model_arch_name(llm_arch arch) {
     auto it = LLM_ARCH_NAMES.find(arch);
     if (it == LLM_ARCH_NAMES.end()) {
         return "unknown";
@@ -2748,6 +2856,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_1B: return "1B";
+        case MODEL_2B: return "2B";
         case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_8B: return "8B";
@@ -2771,6 +2880,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     switch (type) {
         case LLAMA_VOCAB_TYPE_SPM: return "SPM";
         case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
         default: return "unknown";
     }
 }
@@ -2887,6 +2997,15 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_MINICPM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_2B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_FALCON:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2933,6 +3052,27 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
+
+                switch (hparams.n_layer) {
+                    case 3:
+                        model.type = e_model::MODEL_17M; break; // bge-micro
+                    case 6:
+                        model.type = e_model::MODEL_22M; break; // MiniLM-L6
+                    case 12:
+                        switch (hparams.n_embd) {
+                            case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
+                            case 768: model.type = e_model::MODEL_109M; break; // bge-base
+                        } break;
+                    case 24:
+                        model.type = e_model::MODEL_335M; break; // bge-large
+                }
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3137,6 +3277,16 @@ static void llm_load_vocab(
             vocab.special_unk_id = -1;
             vocab.special_sep_id = -1;
             vocab.special_pad_id = -1;
+        } else if (tokenizer_name == "bert") {
+            vocab.type = LLAMA_VOCAB_TYPE_WPM;
+
+            // default special tokens
+            vocab.special_bos_id = 101;
+            vocab.special_eos_id = 102;
+            vocab.special_unk_id = 100;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
+            vocab.add_space_prefix = false;
         } else {
             LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -3164,7 +3314,14 @@ static void llm_load_vocab(
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        try {
+            vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+            vocab.linefeed_id = vocab.special_pad_id;
+        }
+    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
+        vocab.linefeed_id = vocab.special_pad_id;
     } else {
         const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
         GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -3310,11 +3467,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     const auto & hparams = model.hparams;
     const auto & vocab = model.vocab;
 
-    const auto & rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
 
     // hparams
     LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
-    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
+    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
     LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
     LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
@@ -3336,7 +3493,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
-    LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
+    LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
     LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
@@ -3402,22 +3559,18 @@ static bool llm_load_tensors(
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
     }
 
-#ifdef GGML_USE_CUBLAS
     if (split_mode == LLAMA_SPLIT_LAYER) {
         // calculate the split points
-        int device_count = ggml_backend_cuda_get_device_count();
+        int device_count = llama_get_device_count();
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
-        float splits[GGML_CUDA_MAX_DEVICES];
+        std::vector<float> splits(device_count);
         if (all_zero) {
             // default split, by free memory
             for (int i = 0; i < device_count; ++i) {
-                size_t total;
-                size_t free;
-                ggml_backend_cuda_get_device_memory(i, &total, &free);
-                splits[i] = free;
+                splits[i] = llama_get_device_memory(i);
             }
         } else {
-            std::copy(tensor_split, tensor_split + device_count, splits);
+            std::copy(tensor_split, tensor_split + device_count, splits.begin());
         }
 
         // sum and normalize the splits to get the split points
@@ -3433,19 +3586,17 @@ static bool llm_load_tensors(
         // assign the repeating layers to the devices according to the splits
         int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
-            int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
+            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
             model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
-            int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
+            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
             model.buft_output = llama_default_buffer_type_offload(layer_gpu);
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
-    } else
-#endif
-    {
+    } else {
         ggml_backend_buffer_type_t split_buft;
         if (split_mode == LLAMA_SPLIT_ROW) {
             split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
|
|
3508
3659
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
3509
3660
|
const int64_t n_embd_gqa = n_embd_v_gqa;
|
3510
3661
|
const int64_t n_vocab = hparams.n_vocab;
|
3662
|
+
const int64_t n_vocab_type = hparams.n_vocab_type;
|
3511
3663
|
const int64_t n_ff = hparams.n_ff;
|
3512
3664
|
|
3513
3665
|
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
@@ -3524,13 +3676,16 @@ static bool llm_load_tensors(
|
|
3524
3676
|
switch (model.arch) {
|
3525
3677
|
case LLM_ARCH_LLAMA:
|
3526
3678
|
case LLM_ARCH_REFACT:
|
3679
|
+
case LLM_ARCH_MINICPM:
|
3527
3680
|
{
|
3528
3681
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
3529
3682
|
|
3530
3683
|
// output
|
3531
3684
|
{
|
3532
3685
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
3533
|
-
model.
|
3686
|
+
if (model.arch != LLM_ARCH_MINICPM){
|
3687
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
3688
|
+
}
|
3534
3689
|
}
|
3535
3690
|
|
3536
3691
|
for (int i = 0; i < n_layer; ++i) {
|
@@ -3719,11 +3874,50 @@ static bool llm_load_tensors(
|
|
3719
3874
|
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
|
3720
3875
|
}
|
3721
3876
|
} break;
|
3722
|
-
case
|
3877
|
+
case LLM_ARCH_BERT:
|
3723
3878
|
{
|
3724
3879
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
3725
|
-
model.
|
3726
|
-
model.
|
3880
|
+
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
|
3881
|
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
|
3882
|
+
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
3883
|
+
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
3884
|
+
|
3885
|
+
for (int i = 0; i < n_layer; ++i) {
|
3886
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
3887
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
3888
|
+
|
3889
|
+
auto & layer = model.layers[i];
|
3890
|
+
|
3891
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
3892
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
3893
|
+
|
3894
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
3895
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
3896
|
+
|
3897
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
3898
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
3899
|
+
|
3900
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
3901
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
3902
|
+
|
3903
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
3904
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
3905
|
+
|
3906
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
3907
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
3908
|
+
|
3909
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
3910
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
3911
|
+
|
3912
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
3913
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
3914
|
+
}
|
3915
|
+
} break;
|
3916
|
+
case LLM_ARCH_BLOOM:
|
3917
|
+
{
|
3918
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
3919
|
+
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
3920
|
+
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
3727
3921
|
|
3728
3922
|
// output
|
3729
3923
|
{
|
@@ -4145,8 +4339,7 @@ static bool llm_load_tensors(
|
|
4145
4339
|
ctx_bufs.emplace_back(ctx, buf);
|
4146
4340
|
}
|
4147
4341
|
|
4148
|
-
|
4149
|
-
{
|
4342
|
+
if (llama_supports_gpu_offload()) {
|
4150
4343
|
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
4151
4344
|
|
4152
4345
|
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
@@ -4158,10 +4351,11 @@ static bool llm_load_tensors(
|
|
4158
4351
|
const int max_offloadable_layers = hparams.n_layer + 1;
|
4159
4352
|
|
4160
4353
|
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
4354
|
+
}
|
4161
4355
|
|
4162
|
-
|
4163
|
-
|
4164
|
-
|
4356
|
+
// print memory requirements
|
4357
|
+
for (ggml_backend_buffer_t buf : model.bufs) {
|
4358
|
+
LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
4165
4359
|
}
|
4166
4360
|
|
4167
4361
|
// populate tensors_by_name
|
@@ -4195,9 +4389,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
|
4195
4389
|
|
4196
4390
|
model.hparams.vocab_only = params.vocab_only;
|
4197
4391
|
|
4198
|
-
|
4199
|
-
|
4200
|
-
|
4392
|
+
try {
|
4393
|
+
llm_load_arch(ml, model);
|
4394
|
+
} catch(const std::exception & e) {
|
4395
|
+
throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
|
4396
|
+
}
|
4397
|
+
try {
|
4398
|
+
llm_load_hparams(ml, model);
|
4399
|
+
} catch(const std::exception & e) {
|
4400
|
+
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
|
4401
|
+
}
|
4402
|
+
try {
|
4403
|
+
llm_load_vocab(ml, model);
|
4404
|
+
} catch(const std::exception & e) {
|
4405
|
+
throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
|
4406
|
+
}
|
4201
4407
|
|
4202
4408
|
llm_load_print_meta(ml, model);
|
4203
4409
|
|
@@ -4675,6 +4881,7 @@ struct llm_build_context {
|
|
4675
4881
|
const int32_t n_orig_ctx;
|
4676
4882
|
|
4677
4883
|
const bool do_rope_shift;
|
4884
|
+
const bool do_pooling;
|
4678
4885
|
|
4679
4886
|
const llm_build_cb & cb;
|
4680
4887
|
|
@@ -4718,6 +4925,7 @@ struct llm_build_context {
|
|
4718
4925
|
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
4719
4926
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
4720
4927
|
do_rope_shift (worst_case || kv_self.has_shift),
|
4928
|
+
do_pooling (hparams.pooling_layer && cparams.do_pooling),
|
4721
4929
|
cb (cb),
|
4722
4930
|
buf_compute_meta (lctx.buf_compute_meta) {
|
4723
4931
|
// all initializations should be done in init()
|
@@ -5561,6 +5769,103 @@ struct llm_build_context {
|
|
5561
5769
|
return gf;
|
5562
5770
|
}
|
5563
5771
|
|
5772
|
+
struct ggml_cgraph * build_bert() {
|
5773
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5774
|
+
|
5775
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5776
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
5777
|
+
|
5778
|
+
struct ggml_tensor * cur;
|
5779
|
+
struct ggml_tensor * inpL;
|
5780
|
+
|
5781
|
+
// get input vectors with right size
|
5782
|
+
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
|
5783
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
5784
|
+
struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
|
5785
|
+
|
5786
|
+
// construct input embeddings (token, type, position)
|
5787
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
5788
|
+
|
5789
|
+
// token types are hardcoded to zero ("Sentence A")
|
5790
|
+
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
|
5791
|
+
inpL = ggml_add(ctx0, inpL, type_row0);
|
5792
|
+
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
|
5793
|
+
cb(inpL, "inp_embd", -1);
|
5794
|
+
|
5795
|
+
// embed layer norm
|
5796
|
+
inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
|
5797
|
+
cb(inpL, "inp_norm", -1);
|
5798
|
+
|
5799
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5800
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5801
|
+
cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
|
5802
|
+
|
5803
|
+
// iterate layers
|
5804
|
+
for (int il = 0; il < n_layer; ++il) {
|
5805
|
+
struct ggml_tensor * cur = inpL;
|
5806
|
+
|
5807
|
+
// self-attention
|
5808
|
+
{
|
5809
|
+
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
5810
|
+
cb(Qcur, "Qcur", il);
|
5811
|
+
|
5812
|
+
struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
|
5813
|
+
cb(Kcur, "Kcur", il);
|
5814
|
+
|
5815
|
+
struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
|
5816
|
+
cb(Vcur, "Vcur", il);
|
5817
|
+
|
5818
|
+
// seems like we just need to do this for Q?
|
5819
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
5820
|
+
|
5821
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5822
|
+
model.layers[il].wo, model.layers[il].bo,
|
5823
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5824
|
+
cb(cur, "kqv_out", il);
|
5825
|
+
}
|
5826
|
+
|
5827
|
+
// re-add the layer input
|
5828
|
+
cur = ggml_add(ctx0, cur, inpL);
|
5829
|
+
|
5830
|
+
// attention layer norm
|
5831
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
|
5832
|
+
|
5833
|
+
struct ggml_tensor * ffn_inp = cur;
|
5834
|
+
cb(ffn_inp, "ffn_inp", il);
|
5835
|
+
|
5836
|
+
// feed-forward network
|
5837
|
+
cur = llm_build_ffn(ctx0, cur,
|
5838
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
5839
|
+
NULL, NULL,
|
5840
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
5841
|
+
NULL,
|
5842
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
5843
|
+
cb(cur, "ffn_out", il);
|
5844
|
+
|
5845
|
+
// attentions bypass the intermediate layer
|
5846
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
5847
|
+
|
5848
|
+
// output layer norm
|
5849
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
|
5850
|
+
|
5851
|
+
// input for next layer
|
5852
|
+
inpL = cur;
|
5853
|
+
}
|
5854
|
+
|
5855
|
+
// final output
|
5856
|
+
cur = inpL;
|
5857
|
+
|
5858
|
+
// pooling layer
|
5859
|
+
if (do_pooling) {
|
5860
|
+
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
|
5861
|
+
}
|
5862
|
+
cb(cur, "result_embd", -1);
|
5863
|
+
|
5864
|
+
ggml_build_forward_expand(gf, cur);
|
5865
|
+
|
5866
|
+
return gf;
|
5867
|
+
}
|
5868
|
+
|
5564
5869
|
struct ggml_cgraph * build_bloom() {
|
5565
5870
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5566
5871
|
|
@@ -6781,96 +7086,180 @@ struct llm_build_context {
|
|
6781
7086
|
return gf;
|
6782
7087
|
}
|
6783
7088
|
|
6784
|
-
|
7089
|
+
// ref: https://arxiv.org/abs/2203.03466
|
7090
|
+
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
|
7091
|
+
// based on the original build_llama() function
|
7092
|
+
struct ggml_cgraph * build_minicpm() {
|
7093
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
6785
7094
|
|
6786
|
-
|
6787
|
-
|
6788
|
-
|
6789
|
-
const auto & model = lctx.model;
|
7095
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
7096
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
7097
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
6790
7098
|
|
6791
|
-
|
6792
|
-
|
7099
|
+
const int64_t n_embd = hparams.n_embd;
|
7100
|
+
//TODO: if the model varies, these parameters need to be read from the model
|
7101
|
+
const int64_t n_embd_base = 256;
|
7102
|
+
const float scale_embd = 12.0f;
|
7103
|
+
const float scale_depth = 1.4f;
|
6793
7104
|
|
6794
|
-
|
6795
|
-
|
6796
|
-
|
6797
|
-
|
6798
|
-
|
6799
|
-
|
7105
|
+
struct ggml_tensor * cur;
|
7106
|
+
struct ggml_tensor * inpL;
|
7107
|
+
|
7108
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
7109
|
+
cb(inpL, "inp_embd", -1);
|
7110
|
+
|
7111
|
+
// scale the input embeddings
|
7112
|
+
inpL = ggml_scale(ctx0, inpL, scale_embd);
|
7113
|
+
cb(inpL, "inp_scaled", -1);
|
7114
|
+
|
7115
|
+
// inp_pos - contains the positions
|
7116
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
7117
|
+
cb(inp_pos, "inp_pos", -1);
|
7118
|
+
|
7119
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7120
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
7121
|
+
cb(KQ_mask, "KQ_mask", -1);
|
7122
|
+
|
7123
|
+
// shift the entire K-cache if needed
|
7124
|
+
if (do_rope_shift) {
|
7125
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
6800
7126
|
}
|
6801
7127
|
|
6802
|
-
|
6803
|
-
|
6804
|
-
|
6805
|
-
|
7128
|
+
for (int il = 0; il < n_layer; ++il) {
|
7129
|
+
struct ggml_tensor * inpSA = inpL;
|
7130
|
+
|
7131
|
+
// norm
|
7132
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
7133
|
+
model.layers[il].attn_norm, NULL,
|
7134
|
+
LLM_NORM_RMS, cb, il);
|
7135
|
+
cb(cur, "attn_norm", il);
|
7136
|
+
|
7137
|
+
// self-attention
|
7138
|
+
{
|
7139
|
+
// compute Q and K and RoPE them
|
7140
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
7141
|
+
cb(Qcur, "Qcur", il);
|
7142
|
+
if (model.layers[il].bq) {
|
7143
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
7144
|
+
cb(Qcur, "Qcur", il);
|
7145
|
+
}
|
7146
|
+
|
7147
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
7148
|
+
cb(Kcur, "Kcur", il);
|
7149
|
+
if (model.layers[il].bk) {
|
7150
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
7151
|
+
cb(Kcur, "Kcur", il);
|
7152
|
+
}
|
7153
|
+
|
7154
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
7155
|
+
cb(Vcur, "Vcur", il);
|
7156
|
+
if (model.layers[il].bv) {
|
7157
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
7158
|
+
cb(Vcur, "Vcur", il);
|
7159
|
+
}
|
7160
|
+
|
7161
|
+
Qcur = ggml_rope_custom(
|
7162
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7163
|
+
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
7164
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
7165
|
+
);
|
7166
|
+
cb(Qcur, "Qcur", il);
|
7167
|
+
|
7168
|
+
Kcur = ggml_rope_custom(
|
7169
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7170
|
+
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
7171
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
7172
|
+
);
|
7173
|
+
cb(Kcur, "Kcur", il);
|
7174
|
+
|
7175
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7176
|
+
model.layers[il].wo, model.layers[il].bo,
|
7177
|
+
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7178
|
+
cb(cur, "kqv_out", il);
|
6806
7179
|
}
|
6807
|
-
}
|
6808
|
-
};
|
6809
7180
|
|
6810
|
-
|
7181
|
+
// scale_res - scale the hidden states for residual connection
|
7182
|
+
const float scale_res = scale_depth/sqrtf(float(n_layer));
|
7183
|
+
cur = ggml_scale(ctx0, cur, scale_res);
|
7184
|
+
cb(cur, "hidden_scaled", -1);
|
6811
7185
|
|
6812
|
-
|
7186
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7187
|
+
cb(ffn_inp, "ffn_inp", il);
|
6813
7188
|
|
6814
|
-
|
6815
|
-
|
6816
|
-
|
7189
|
+
// feed-forward network
|
7190
|
+
{
|
7191
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
7192
|
+
model.layers[il].ffn_norm, NULL,
|
7193
|
+
LLM_NORM_RMS, cb, il);
|
7194
|
+
cb(cur, "ffn_norm", il);
|
6817
7195
|
|
6818
|
-
|
6819
|
-
|
6820
|
-
|
7196
|
+
cur = llm_build_ffn(ctx0, cur,
|
7197
|
+
model.layers[il].ffn_up, NULL,
|
7198
|
+
model.layers[il].ffn_gate, NULL,
|
7199
|
+
model.layers[il].ffn_down, NULL,
|
7200
|
+
NULL,
|
7201
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
7202
|
+
cb(cur, "ffn_out", il);
|
7203
|
+
}
|
6821
7204
|
|
6822
|
-
|
6823
|
-
|
7205
|
+
// scale the hidden states for residual connection
|
7206
|
+
cur = ggml_scale(ctx0, cur, scale_res);
|
7207
|
+
cb(cur, "hidden_scaled_ffn", -1);
|
6824
7208
|
|
6825
|
-
|
6826
|
-
|
6827
|
-
const int64_t n_tokens = batch.n_tokens;
|
7209
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
7210
|
+
cb(cur, "l_out", il);
|
6828
7211
|
|
6829
|
-
|
7212
|
+
// input for next layer
|
7213
|
+
inpL = cur;
|
6830
7214
|
}
|
6831
7215
|
|
6832
|
-
|
6833
|
-
const int64_t n_tokens = batch.n_tokens;
|
7216
|
+
cur = inpL;
|
6834
7217
|
|
6835
|
-
|
6836
|
-
|
7218
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7219
|
+
model.output_norm, NULL,
|
7220
|
+
LLM_NORM_RMS, cb, -1);
|
7221
|
+
cb(cur, "result_norm", -1);
|
6837
7222
|
|
6838
|
-
|
6839
|
-
|
6840
|
-
|
7223
|
+
// lm_head scaling
|
7224
|
+
const float scale_lmhead = float(n_embd_base)/float(n_embd);
|
7225
|
+
cur = ggml_scale(ctx0, cur, scale_lmhead);
|
7226
|
+
cb(cur, "lmhead_scaling", -1);
|
6841
7227
|
|
6842
|
-
|
6843
|
-
|
7228
|
+
// lm_head
|
7229
|
+
cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
|
7230
|
+
cb(cur, "result_output", -1);
|
6844
7231
|
|
6845
|
-
|
6846
|
-
for (int j = 0; j < n_tokens; ++j) {
|
6847
|
-
const llama_pos pos = batch.pos[j];
|
6848
|
-
const llama_seq_id seq_id = batch.seq_id[j][0];
|
7232
|
+
ggml_build_forward_expand(gf, cur);
|
6849
7233
|
|
6850
|
-
|
6851
|
-
|
6852
|
-
|
6853
|
-
f = -INFINITY;
|
6854
|
-
} else {
|
6855
|
-
f = 0;
|
6856
|
-
}
|
6857
|
-
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
6858
|
-
}
|
6859
|
-
}
|
6860
|
-
}
|
6861
|
-
}
|
7234
|
+
return gf;
|
7235
|
+
}
|
7236
|
+
};
|
6862
7237
|
|
6863
|
-
|
6864
|
-
|
7238
|
+
static struct ggml_cgraph * llama_build_graph(
|
7239
|
+
llama_context & lctx,
|
7240
|
+
const llama_batch & batch,
|
7241
|
+
bool worst_case) {
|
7242
|
+
const auto & model = lctx.model;
|
6865
7243
|
|
6866
|
-
|
6867
|
-
|
7244
|
+
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
7245
|
+
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
7246
|
+
if (il >= 0) {
|
7247
|
+
ggml_format_name(cur, "%s-%d", name, il);
|
7248
|
+
} else {
|
7249
|
+
ggml_set_name(cur, name);
|
7250
|
+
}
|
6868
7251
|
|
6869
|
-
|
6870
|
-
|
7252
|
+
if (!lctx.cparams.offload_kqv) {
|
7253
|
+
if (strcmp(name, "kqv_merged_cont") == 0) {
|
7254
|
+
// all nodes between the KV store and the attention output are run on the CPU
|
7255
|
+
ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
|
6871
7256
|
}
|
6872
7257
|
}
|
6873
|
-
}
|
7258
|
+
};
|
7259
|
+
|
7260
|
+
struct ggml_cgraph * result = NULL;
|
7261
|
+
|
7262
|
+
struct llm_build_context llm(lctx, batch, cb, worst_case);
|
6874
7263
|
|
6875
7264
|
llm.init();
|
6876
7265
|
|
@@ -6899,6 +7288,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
6899
7288
|
{
|
6900
7289
|
result = llm.build_refact();
|
6901
7290
|
} break;
|
7291
|
+
case LLM_ARCH_BERT:
|
7292
|
+
{
|
7293
|
+
result = llm.build_bert();
|
7294
|
+
} break;
|
6902
7295
|
case LLM_ARCH_BLOOM:
|
6903
7296
|
{
|
6904
7297
|
result = llm.build_bloom();
|
@@ -6943,6 +7336,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
6943
7336
|
{
|
6944
7337
|
result = llm.build_internlm2();
|
6945
7338
|
} break;
|
7339
|
+
case LLM_ARCH_MINICPM:
|
7340
|
+
{
|
7341
|
+
result = llm.build_minicpm();
|
7342
|
+
} break;
|
6946
7343
|
default:
|
6947
7344
|
GGML_ASSERT(false);
|
6948
7345
|
}
|
@@ -6952,6 +7349,97 @@ static struct ggml_cgraph * llama_build_graph(
|
|
6952
7349
|
return result;
|
6953
7350
|
}
|
6954
7351
|
|
7352
|
+
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
7353
|
+
//
|
7354
|
+
// set input data
|
7355
|
+
//
|
7356
|
+
|
7357
|
+
const auto & hparams = lctx.model.hparams;
|
7358
|
+
const auto & cparams = lctx.cparams;
|
7359
|
+
const auto & kv_self = lctx.kv_self;
|
7360
|
+
|
7361
|
+
if (batch.token) {
|
7362
|
+
const int64_t n_tokens = batch.n_tokens;
|
7363
|
+
|
7364
|
+
ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
|
7365
|
+
}
|
7366
|
+
|
7367
|
+
if (batch.embd) {
|
7368
|
+
const int64_t n_embd = hparams.n_embd;
|
7369
|
+
const int64_t n_tokens = batch.n_tokens;
|
7370
|
+
|
7371
|
+
ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
7372
|
+
}
|
7373
|
+
|
7374
|
+
if (batch.pos) {
|
7375
|
+
const int64_t n_tokens = batch.n_tokens;
|
7376
|
+
|
7377
|
+
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
7378
|
+
}
|
7379
|
+
|
7380
|
+
{
|
7381
|
+
const int64_t n_kv = kv_self.n;
|
7382
|
+
const int64_t n_tokens = batch.n_tokens;
|
7383
|
+
|
7384
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
7385
|
+
|
7386
|
+
float * data = (float *) lctx.inp_KQ_mask->data;
|
7387
|
+
|
7388
|
+
for (int h = 0; h < 1; ++h) {
|
7389
|
+
for (int j = 0; j < n_tokens; ++j) {
|
7390
|
+
const llama_pos pos = batch.pos[j];
|
7391
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
7392
|
+
|
7393
|
+
for (int i = 0; i < n_kv; ++i) {
|
7394
|
+
float f;
|
7395
|
+
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
|
7396
|
+
(hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
|
7397
|
+
f = -INFINITY;
|
7398
|
+
} else {
|
7399
|
+
f = 0;
|
7400
|
+
}
|
7401
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
7402
|
+
}
|
7403
|
+
}
|
7404
|
+
}
|
7405
|
+
}
|
7406
|
+
|
7407
|
+
{
|
7408
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
|
7409
|
+
float * data = (float *) lctx.inp_sum->data;
|
7410
|
+
|
7411
|
+
for (int i = 0; i < batch.n_tokens; ++i) {
|
7412
|
+
data[i] = 1.0f/float(batch.n_tokens);
|
7413
|
+
}
|
7414
|
+
}
|
7415
|
+
|
7416
|
+
if (kv_self.has_shift) {
|
7417
|
+
const int64_t n_ctx = cparams.n_ctx;
|
7418
|
+
|
7419
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
7420
|
+
|
7421
|
+
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
7422
|
+
|
7423
|
+
for (int i = 0; i < n_ctx; ++i) {
|
7424
|
+
data[i] = lctx.kv_self.cells[i].delta;
|
7425
|
+
}
|
7426
|
+
}
|
7427
|
+
|
7428
|
+
if (hparams.pooling_layer && cparams.do_pooling) {
|
7429
|
+
const int64_t n_tokens = batch.n_tokens;
|
7430
|
+
|
7431
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
|
7432
|
+
float * data = (float *) lctx.inp_sum->data;
|
7433
|
+
|
7434
|
+
memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
|
7435
|
+
|
7436
|
+
for (int i = 0; i < n_tokens; ++i) {
|
7437
|
+
const llama_seq_id seq_id = batch.seq_id[i][0];
|
7438
|
+
data[seq_id*n_tokens + i] = 1.0f;
|
7439
|
+
}
|
7440
|
+
}
|
7441
|
+
}
|
7442
|
+
|
6955
7443
|
// decode a batch of tokens by evaluating the transformer
|
6956
7444
|
//
|
6957
7445
|
// - lctx: llama context
|
@@ -7050,17 +7538,22 @@ static int llama_decode_internal(
|
|
7050
7538
|
ggml_backend_sched_reset(lctx.sched);
|
7051
7539
|
ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
|
7052
7540
|
|
7053
|
-
ggml_cgraph * gf = llama_build_graph(lctx, batch);
|
7541
|
+
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
|
7054
7542
|
|
7055
7543
|
// the output is always the last tensor in the graph
|
7056
7544
|
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
7057
|
-
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
7058
|
-
|
7059
|
-
// the embeddings could be the second to last tensor, or the third to last tensor
|
7060
7545
|
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
7061
|
-
if (strcmp(
|
7062
|
-
embeddings
|
7063
|
-
|
7546
|
+
if (strcmp(res->name, "result_output") == 0) {
|
7547
|
+
// the embeddings could be the second to last tensor, or the third to last tensor
|
7548
|
+
if (strcmp(embeddings->name, "result_norm") != 0) {
|
7549
|
+
embeddings = gf->nodes[gf->n_nodes - 3];
|
7550
|
+
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
7551
|
+
}
|
7552
|
+
} else if (strcmp(res->name, "result_embd") == 0) {
|
7553
|
+
embeddings = res;
|
7554
|
+
res = nullptr;
|
7555
|
+
} else {
|
7556
|
+
GGML_ASSERT(false);
|
7064
7557
|
}
|
7065
7558
|
|
7066
7559
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
@@ -7070,7 +7563,9 @@ static int llama_decode_internal(
|
|
7070
7563
|
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
|
7071
7564
|
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
|
7072
7565
|
// with the BLAS calls. need a better solution
|
7073
|
-
|
7566
|
+
// MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
|
7567
|
+
// being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
|
7568
|
+
if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
|
7074
7569
|
n_threads = std::min(4, n_threads);
|
7075
7570
|
}
|
7076
7571
|
|
@@ -7088,6 +7583,9 @@ static int llama_decode_internal(
|
|
7088
7583
|
if (lctx.backend_cpu != nullptr) {
|
7089
7584
|
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
7090
7585
|
}
|
7586
|
+
|
7587
|
+
llama_set_inputs(lctx, batch);
|
7588
|
+
|
7091
7589
|
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
7092
7590
|
|
7093
7591
|
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
@@ -7127,7 +7625,7 @@ static int llama_decode_internal(
|
|
7127
7625
|
// extract logits
|
7128
7626
|
// TODO: do not compute and extract logits if only embeddings are needed
|
7129
7627
|
// need to update the graphs to skip "result_output"
|
7130
|
-
{
|
7628
|
+
if (res) {
|
7131
7629
|
auto & logits_out = lctx.logits;
|
7132
7630
|
|
7133
7631
|
#ifndef NDEBUG
|
@@ -7171,9 +7669,12 @@ static int llama_decode_internal(
|
|
7171
7669
|
if (!lctx.embedding.empty()) {
|
7172
7670
|
auto & embedding_out = lctx.embedding;
|
7173
7671
|
|
7174
|
-
|
7672
|
+
const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
|
7673
|
+
const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
|
7674
|
+
|
7675
|
+
embedding_out.resize(embd_size);
|
7175
7676
|
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
|
7176
|
-
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(),
|
7677
|
+
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
|
7177
7678
|
ggml_backend_synchronize(embeddings_backend);
|
7178
7679
|
}
|
7179
7680
|
|
@@ -7237,6 +7738,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
|
7237
7738
|
GGML_ASSERT(false);
|
7238
7739
|
return unicode_to_bytes_bpe(token_data.text);
|
7239
7740
|
}
|
7741
|
+
case LLAMA_VOCAB_TYPE_WPM: {
|
7742
|
+
GGML_ASSERT(false);
|
7743
|
+
}
|
7240
7744
|
default:
|
7241
7745
|
GGML_ASSERT(false);
|
7242
7746
|
}
|
@@ -7247,8 +7751,15 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
|
7247
7751
|
switch (llama_vocab_get_type(vocab)) {
|
7248
7752
|
case LLAMA_VOCAB_TYPE_SPM: {
|
7249
7753
|
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
7250
|
-
|
7754
|
+
auto token = vocab.token_to_id.find(buf);
|
7755
|
+
if (token != vocab.token_to_id.end()) {
|
7756
|
+
return (*token).second;
|
7757
|
+
}
|
7758
|
+
// Try to fall back to just the byte as a string
|
7759
|
+
const char buf2[2] = { (char)ch, 0 };
|
7760
|
+
return vocab.token_to_id.at(buf2);
|
7251
7761
|
}
|
7762
|
+
case LLAMA_VOCAB_TYPE_WPM:
|
7252
7763
|
case LLAMA_VOCAB_TYPE_BPE: {
|
7253
7764
|
return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
|
7254
7765
|
}
|
@@ -7294,7 +7805,7 @@ struct llm_bigram_spm {
|
|
7294
7805
|
};
|
7295
7806
|
|
7296
7807
|
struct llm_tokenizer_spm {
|
7297
|
-
llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
|
7808
|
+
llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
|
7298
7809
|
|
7299
7810
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
7300
7811
|
// split string into utf8 chars
|
@@ -7369,6 +7880,7 @@ private:
|
|
7369
7880
|
|
7370
7881
|
if (p == rev_merge.end()) {
|
7371
7882
|
// output any symbols that did not form tokens as bytes.
|
7883
|
+
output.reserve(output.size() + symbol.n);
|
7372
7884
|
for (int j = 0; j < (int)symbol.n; ++j) {
|
7373
7885
|
llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
|
7374
7886
|
output.push_back(token_id);
|
@@ -7719,29 +8231,230 @@ private:
|
|
7719
8231
|
llm_bigram_bpe::queue work_queue;
|
7720
8232
|
};
|
7721
8233
|
|
7722
|
-
|
8234
|
+
struct llm_tokenizer_wpm {
|
8235
|
+
llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
|
8236
|
+
|
8237
|
+
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
8238
|
+
auto * token_map = &vocab.token_to_id;
|
8239
|
+
|
8240
|
+
// normalize and split by whitespace
|
8241
|
+
std::vector<std::string> words = preprocess(text);
|
8242
|
+
|
8243
|
+
// bos token prepended already
|
8244
|
+
|
8245
|
+
// find the longest tokens that form the words
|
8246
|
+
for (const std::string &word : words) {
|
8247
|
+
// skip empty words
|
8248
|
+
if (word.size() == 0) {
|
8249
|
+
continue;
|
8250
|
+
}
|
8251
|
+
|
8252
|
+
// prepend phantom space
|
8253
|
+
std::string word1 = "\xe2\x96\x81" + word;
|
8254
|
+
int n = word1.size();
|
8255
|
+
|
8256
|
+
// we're at the start of a new word
|
8257
|
+
int i = 0;
|
8258
|
+
bool match_any = false;
|
8259
|
+
|
8260
|
+
// move through character position in word
|
8261
|
+
while (i < n) {
|
8262
|
+
// loop through possible match length
|
8263
|
+
bool match = false;
|
8264
|
+
for (int j = n; j > i; j--) {
|
8265
|
+
auto it = token_map->find(word1.substr(i, j - i));
|
8266
|
+
if (it != token_map->end()) {
|
8267
|
+
output.push_back(it->second);
|
8268
|
+
match = true;
|
8269
|
+
match_any = true;
|
8270
|
+
i = j;
|
8271
|
+
break;
|
8272
|
+
}
|
8273
|
+
}
|
8274
|
+
|
8275
|
+
// must be an unknown character
|
8276
|
+
if (!match) {
|
8277
|
+
i++;
|
8278
|
+
}
|
8279
|
+
}
|
8280
|
+
|
8281
|
+
// we didn't find any matches for this word
|
8282
|
+
if (!match_any) {
|
8283
|
+
output.push_back(vocab.special_unk_id);
|
8284
|
+
}
|
8285
|
+
}
|
8286
|
+
|
8287
|
+
// append eos token
|
8288
|
+
output.push_back(vocab.special_eos_id);
|
8289
|
+
}
|
8290
|
+
|
8291
|
+
std::vector<std::string> preprocess(const std::string & text) {
|
8292
|
+
std::string ori_str = normalize(text);
|
8293
|
+
uint64_t ori_size = ori_str.size();
|
8294
|
+
|
8295
|
+
// single punct / single symbol / single digit
|
8296
|
+
// baseline: add whitespace on the left and right of punct and chinese characters
|
8297
|
+
std::vector<std::string> words;
|
8298
|
+
std::string new_str = "";
|
8299
|
+
uint64_t i = 0;
|
8300
|
+
while (i < ori_size) {
|
8301
|
+
int utf_char_len = utf8_len(ori_str[i]);
|
8302
|
+
if ((utf_char_len == 1) && ispunct(ori_str[i])) {
|
8303
|
+
new_str += " ";
|
8304
|
+
new_str += ori_str[i];
|
8305
|
+
new_str += " ";
|
8306
|
+
i += 1;
|
8307
|
+
}
|
8308
|
+
else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
|
8309
|
+
new_str += " ";
|
8310
|
+
new_str += ori_str.substr(i, 3);
|
8311
|
+
new_str += " ";
|
8312
|
+
i += 3;
|
8313
|
+
}
|
8314
|
+
else {
|
8315
|
+
new_str += ori_str[i];
|
8316
|
+
i += 1;
|
8317
|
+
}
|
8318
|
+
}
|
8319
|
+
|
8320
|
+
// split by whitespace
|
8321
|
+
uint64_t l = 0;
|
8322
|
+
uint64_t r = 0;
|
8323
|
+
while (r < new_str.size()) {
|
8324
|
+
// if is whitespace
|
8325
|
+
if (isspace(new_str[r])) {
|
8326
|
+
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
8327
|
+
l = r + 1;
|
8328
|
+
r = l;
|
8329
|
+
}
|
8330
|
+
else {
|
8331
|
+
r += 1;
|
8332
|
+
}
|
8333
|
+
}
|
8334
|
+
if (r > l) {
|
8335
|
+
words.push_back(new_str.substr(l, (r - l)));
|
8336
|
+
}
|
8337
|
+
return words;
|
8338
|
+
}
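preprocess pads single-byte punctuation and 3-byte CJK characters with spaces and then splits the padded string on whitespace, so each punctuation mark and CJK character becomes its own word. A reduced standalone sketch of the whitespace-splitting half, the same l/r two-pointer scan as above (illustrative helper):

    #include <cctype>
    #include <string>
    #include <vector>

    // Split on whitespace with an [l, r) two-pointer scan.
    static std::vector<std::string> split_ws(const std::string & s) {
        std::vector<std::string> words;
        size_t l = 0;
        size_t r = 0;
        while (r < s.size()) {
            if (isspace((unsigned char) s[r])) {
                if (r > l) words.push_back(s.substr(l, r - l));
                l = r + 1;
                r = l;
            } else {
                r += 1;
            }
        }
        if (r > l) {
            words.push_back(s.substr(l, r - l));
        }
        return words;
    }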
|
8339
|
+
|
8340
|
+
std::string normalize(const std::string & text) {
|
8341
|
+
// TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
|
8342
|
+
std::string text2 = strip_accents(text);
|
8343
|
+
for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
|
8344
|
+
char c = text2[i];
|
8345
|
+
if (c >= 'A' && c <= 'Z') {
|
8346
|
+
text2[i] = c - 'A' + 'a';
|
8347
|
+
}
|
8348
|
+
}
|
8349
|
+
return text2;
|
8350
|
+
}
|
8351
|
+
|
8352
|
+
bool is_chinese_char(const std::string & str) {
|
8353
|
+
int len = str.length();
|
8354
|
+
unsigned int codepoint = 0;
|
8355
|
+
int num_bytes = 0;
|
8356
|
+
int i = 0;
|
8357
|
+
unsigned char ch = static_cast<unsigned char>(str[i]);
|
8358
|
+
if (ch <= 0x7f) {
|
8359
|
+
codepoint = ch;
|
8360
|
+
num_bytes = 1;
|
8361
|
+
} else if ((ch >> 5) == 0x06) {
|
8362
|
+
codepoint = ch & 0x1f;
|
8363
|
+
num_bytes = 2;
|
8364
|
+
} else if ((ch >> 4) == 0x0e) {
|
8365
|
+
codepoint = ch & 0x0f;
|
8366
|
+
num_bytes = 3;
|
8367
|
+
} else if ((ch >> 3) == 0x1e) {
|
8368
|
+
codepoint = ch & 0x07;
|
8369
|
+
num_bytes = 4;
|
8370
|
+
}
|
8371
|
+
for (int j = 1; j < num_bytes; ++j) {
|
8372
|
+
if (i + j >= len) {
|
8373
|
+
return false; // incomplete UTF-8 character
|
8374
|
+
}
|
8375
|
+
unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
|
8376
|
+
if ((next_ch >> 6) != 0x02) {
|
8377
|
+
return false; // invalid trailing byte
|
8378
|
+
}
|
8379
|
+
codepoint = (codepoint << 6) | (next_ch & 0x3f);
|
8380
|
+
}
|
8381
|
+
if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
|
8382
|
+
(codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
|
8383
|
+
(codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
|
8384
|
+
(codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
|
8385
|
+
(codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
|
8386
|
+
(codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
|
8387
|
+
(codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
|
8388
|
+
(codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
|
8389
|
+
(codepoint >= 0x3000 && codepoint <= 0x303F) ||
|
8390
|
+
(codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
|
8391
|
+
return true; // NOLINT
|
8392
|
+
}
|
8393
|
+
return false;
|
8394
|
+
}
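is_chinese_char decodes only the leading UTF-8 sequence of the string into a codepoint and then checks it against the CJK Unified Ideographs blocks (plus extensions, compatibility ideographs, CJK punctuation, and fullwidth forms). A compact standalone version of the decode step it relies on, using the same shift/mask logic (illustrative helper, not the library's API):

    #include <cstdint>
    #include <string>

    // Decode the first UTF-8 sequence of `s` into a codepoint; returns 0 on malformed input.
    static uint32_t utf8_first_codepoint(const std::string & s) {
        if (s.empty()) return 0;
        const unsigned char c0 = (unsigned char) s[0];
        int n = 0;
        uint32_t cp = 0;
        if      (c0 <= 0x7f)        { cp = c0;        n = 1; }
        else if ((c0 >> 5) == 0x06) { cp = c0 & 0x1f; n = 2; }
        else if ((c0 >> 4) == 0x0e) { cp = c0 & 0x0f; n = 3; }
        else if ((c0 >> 3) == 0x1e) { cp = c0 & 0x07; n = 4; }
        else                        { return 0; }
        if ((int) s.size() < n) return 0;              // incomplete sequence
        for (int j = 1; j < n; ++j) {
            const unsigned char cj = (unsigned char) s[j];
            if ((cj >> 6) != 0x02) return 0;           // not a continuation byte
            cp = (cp << 6) | (cj & 0x3f);
        }
        return cp;
    }

    // Example: utf8_first_codepoint("\xe4\xb8\xad") == 0x4E2D, which falls in the
    // basic CJK block (0x4E00..0x9FFF) tested above.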
|
8395
|
+
|
8396
|
+
std::string strip_accents(const std::string & input_string) {
|
8397
|
+
std::string resultString;
|
8398
|
+
std::map<std::string, char> accent_map = {
|
8399
|
+
{"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
|
8400
|
+
{"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
|
8401
|
+
{"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
|
8402
|
+
{"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
|
8403
|
+
{"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
|
8404
|
+
{"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
|
8405
|
+
{"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
|
8406
|
+
{"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
|
8407
|
+
{"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
|
8408
|
+
};
|
8409
|
+
|
8410
|
+
for (size_t i = 0; i < input_string.length();) {
|
8411
|
+
int len = utf8_len(input_string[i]);
|
8412
|
+
std::string curChar = input_string.substr(i, len);
|
8413
|
+
auto iter = accent_map.find(curChar);
|
8414
|
+
if (iter != accent_map.end()) {
|
8415
|
+
resultString += iter->second;
|
8416
|
+
} else {
|
8417
|
+
resultString += curChar;
|
8418
|
+
}
|
8419
|
+
i += len;
|
8420
|
+
}
|
8421
|
+
|
8422
|
+
return resultString;
|
8423
|
+
}
|
8424
|
+
|
8425
|
+
static size_t utf8_len(char src) {
|
8426
|
+
const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
|
8427
|
+
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
|
8428
|
+
return lookup[highbits];
|
8429
|
+
}
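utf8_len classifies a lead byte by its high nibble: nibbles 0 through 11 (ASCII and continuation bytes) map to length 1, 12 and 13 start 2-byte sequences, 14 a 3-byte sequence, and 15 a 4-byte sequence. A small standalone check of that mapping (same table, duplicated here so it compiles on its own):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    static size_t utf8_seq_len(char src) {
        const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
        return lookup[static_cast<uint8_t>(src) >> 4];
    }

    int main() {
        assert(utf8_seq_len('A')    == 1);   // 0x41 -> high nibble 0x4
        assert(utf8_seq_len('\xc3') == 2);   // lead byte of a 2-byte sequence, e.g. "é"
        assert(utf8_seq_len('\xe4') == 3);   // lead byte of a 3-byte sequence, e.g. "中"
        assert(utf8_seq_len('\xf0') == 4);   // lead byte of a 4-byte sequence
        return 0;
    }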
|
8430
|
+
|
8431
|
+
const llama_vocab & vocab;
|
8432
|
+
};
|
8433
|
+
|
8434
|
+
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
|
7723
8435
|
FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
|
7724
8436
|
FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
|
7725
8437
|
} FRAGMENT_BUFFER_VARIANT_TYPE;
|
7726
8438
|
|
7727
|
-
struct fragment_buffer_variant{
|
8439
|
+
struct fragment_buffer_variant {
|
7728
8440
|
fragment_buffer_variant(llama_vocab::id _token)
|
7729
8441
|
:
|
7730
8442
|
type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
|
7731
8443
|
token(_token),
|
7732
8444
|
raw_text(_dummy),
|
7733
8445
|
offset(0),
|
7734
|
-
length(0){}
|
8446
|
+
length(0) {}
|
8447
|
+
|
7735
8448
|
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
|
7736
8449
|
:
|
7737
8450
|
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
|
7738
|
-
token((llama_vocab::id)-1),
|
8451
|
+
token((llama_vocab::id) - 1),
|
7739
8452
|
raw_text(_raw_text),
|
7740
8453
|
offset(_offset),
|
7741
8454
|
length(_length){
|
7742
|
-
GGML_ASSERT(
|
7743
|
-
GGML_ASSERT(
|
7744
|
-
GGML_ASSERT(
|
8455
|
+
GGML_ASSERT(_offset >= 0);
|
8456
|
+
GGML_ASSERT(_length >= 1);
|
8457
|
+
GGML_ASSERT(offset + length <= raw_text.length());
|
7745
8458
|
}
|
7746
8459
|
|
7747
8460
|
const FRAGMENT_BUFFER_VARIANT_TYPE type;
|
@@ -7754,8 +8467,7 @@ struct fragment_buffer_variant{
|
|
7754
8467
|
|
7755
8468
|
// #define PRETOKENIZERDEBUG
|
7756
8469
|
|
7757
|
-
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
|
7758
|
-
{
|
8470
|
+
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
7759
8471
|
// for each special token
|
7760
8472
|
for (const auto & st: vocab.special_tokens_cache) {
|
7761
8473
|
const auto & special_token = st.first;
|
@@ -7866,17 +8578,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
7866
8578
|
}
|
7867
8579
|
|
7868
8580
|
std::forward_list<fragment_buffer_variant> fragment_buffer;
|
7869
|
-
fragment_buffer.emplace_front(
|
8581
|
+
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
|
7870
8582
|
|
7871
|
-
if (special) tokenizer_st_partition(
|
8583
|
+
if (special) tokenizer_st_partition(vocab, fragment_buffer);
|
7872
8584
|
|
7873
8585
|
switch (vocab.type) {
|
7874
8586
|
case LLAMA_VOCAB_TYPE_SPM:
|
7875
8587
|
{
|
7876
|
-
for (const auto & fragment: fragment_buffer)
|
7877
|
-
|
7878
|
-
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
|
7879
|
-
{
|
8588
|
+
for (const auto & fragment : fragment_buffer) {
|
8589
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
7880
8590
|
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
7881
8591
|
|
7882
8592
|
// TODO: It's likely possible to get rid of this string copy entirely
|
@@ -7896,19 +8606,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
7896
8606
|
llm_tokenizer_spm tokenizer(vocab);
|
7897
8607
|
llama_escape_whitespace(raw_text);
|
7898
8608
|
tokenizer.tokenize(raw_text, output);
|
7899
|
-
}
|
7900
|
-
else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
7901
|
-
{
|
8609
|
+
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
7902
8610
|
output.push_back(fragment.token);
|
7903
8611
|
}
|
7904
8612
|
}
|
7905
8613
|
} break;
|
7906
8614
|
case LLAMA_VOCAB_TYPE_BPE:
|
7907
8615
|
{
|
7908
|
-
for (const auto & fragment: fragment_buffer)
|
7909
|
-
|
7910
|
-
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
|
7911
|
-
{
|
8616
|
+
for (const auto & fragment : fragment_buffer) {
|
8617
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
7912
8618
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
7913
8619
|
|
7914
8620
|
#ifdef PRETOKENIZERDEBUG
|
@@ -7916,9 +8622,23 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
7916
8622
|
#endif
|
7917
8623
|
llm_tokenizer_bpe tokenizer(vocab);
|
7918
8624
|
tokenizer.tokenize(raw_text, output);
|
8625
|
+
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
8626
|
+
output.push_back(fragment.token);
|
7919
8627
|
}
|
7920
|
-
|
7921
|
-
|
8628
|
+
}
|
8629
|
+
} break;
|
8630
|
+
case LLAMA_VOCAB_TYPE_WPM:
|
8631
|
+
{
|
8632
|
+
for (const auto & fragment : fragment_buffer) {
|
8633
|
+
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
8634
|
+
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
8635
|
+
|
8636
|
+
#ifdef PRETOKENIZERDEBUG
|
8637
|
+
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
8638
|
+
#endif
|
8639
|
+
llm_tokenizer_wpm tokenizer(vocab);
|
8640
|
+
tokenizer.tokenize(raw_text, output);
|
8641
|
+
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
7922
8642
|
output.push_back(fragment.token);
|
7923
8643
|
}
|
7924
8644
|
}
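After special-token partitioning, the fragment buffer holds a mix of raw-text pieces still to be tokenized and token ids that were already resolved; each vocab branch above tokenizes the former and copies the latter straight into the output. A compact sketch of that two-kind dispatch using std::variant (conceptual only; the real code uses fragment_buffer_variant and the per-vocab tokenizers):

    #include <string>
    #include <variant>
    #include <vector>

    // A raw-text piece still to be tokenized, or an already-resolved token id.
    using fragment = std::variant<std::string, int>;

    // Hypothetical per-vocab tokenizer hook; the real code emits vocabulary ids.
    static void tokenize_text(const std::string & text, std::vector<int> & out) {
        out.push_back((int) text.size());   // placeholder behaviour
    }

    static std::vector<int> tokenize_fragments(const std::vector<fragment> & frags) {
        std::vector<int> out;
        for (const auto & f : frags) {
            if (std::holds_alternative<std::string>(f)) {
                tokenize_text(std::get<std::string>(f), out);   // raw text fragment
            } else {
                out.push_back(std::get<int>(f));                // pre-resolved token
            }
        }
        return out;
    }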
|
@@ -8373,6 +9093,10 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
|
|
8373
9093
|
|
8374
9094
|
const int64_t t_start_sample_us = ggml_time_us();
|
8375
9095
|
|
9096
|
+
if (k <= 0) {
|
9097
|
+
k = candidates->size;
|
9098
|
+
}
|
9099
|
+
|
8376
9100
|
k = std::max(k, (int) min_keep);
|
8377
9101
|
k = std::min(k, (int) candidates->size);
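With the added guard, passing k <= 0 to llama_sample_top_k now means "keep every candidate": k is reset to the candidate count before the usual clamp to [min_keep, candidates->size]. A tiny standalone illustration of just that clamping rule (plain ints, no llama types):

    #include <algorithm>
    #include <cstdio>

    // Clamping rule from llama_sample_top_k: k <= 0 selects all candidates.
    static int effective_top_k(int k, int n_candidates, int min_keep) {
        if (k <= 0) {
            k = n_candidates;
        }
        k = std::max(k, min_keep);
        k = std::min(k, n_candidates);
        return k;
    }

    int main() {
        printf("%d\n", effective_top_k( 0, 32000, 1));  // 32000: top-k effectively disabled
        printf("%d\n", effective_top_k(40, 32000, 1));  // 40
        printf("%d\n", effective_top_k(40,    10, 1));  // 10
        return 0;
    }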
|
8378
9102
|
|
@@ -9456,8 +10180,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9456
10180
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
|
9457
10181
|
new_type = GGML_TYPE_Q4_K;
|
9458
10182
|
}
|
9459
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS
|
9460
|
-
new_type = GGML_TYPE_Q4_K;
|
10183
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
10184
|
+
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
|
9461
10185
|
}
|
9462
10186
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
9463
10187
|
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
@@ -9496,9 +10220,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9496
10220
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
|
9497
10221
|
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
9498
10222
|
}
|
9499
|
-
|
9500
|
-
|
9501
|
-
|
10223
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
|
10224
|
+
new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
10225
|
+
}
|
9502
10226
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
9503
10227
|
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
|
9504
10228
|
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
|
@@ -9566,6 +10290,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9566
10290
|
}
|
9567
10291
|
++qs.i_ffn_up;
|
9568
10292
|
}
|
10293
|
+
|
9569
10294
|
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
9570
10295
|
//}
|
9571
10296
|
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
|
@@ -9625,19 +10350,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9625
10350
|
|
9626
10351
|
// K-quants
|
9627
10352
|
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
9628
|
-
case LLAMA_FTYPE_MOSTLY_Q2_K:
|
10353
|
+
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
9629
10354
|
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
|
9630
10355
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
9631
10356
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
9632
|
-
case LLAMA_FTYPE_MOSTLY_Q3_K_L:
|
10357
|
+
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
9633
10358
|
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
9634
|
-
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
|
10359
|
+
case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
|
9635
10360
|
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
9636
|
-
case LLAMA_FTYPE_MOSTLY_Q5_K_M:
|
9637
|
-
case LLAMA_FTYPE_MOSTLY_Q6_K:
|
9638
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
|
9639
|
-
case LLAMA_FTYPE_MOSTLY_IQ2_XS
|
9640
|
-
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
|
10361
|
+
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
|
10362
|
+
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
10363
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
|
10364
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
|
10365
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
|
9641
10366
|
|
9642
10367
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
9643
10368
|
}
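The switch above maps the requested file type onto the element type used for most tensors, with fall-through grouping several ftypes onto one quantization type (for example, Q4_K_S and Q4_K_M both resolve to GGML_TYPE_Q4_K). A trimmed illustration of the same fall-through pattern with placeholder enums (not the library's real enumerators or values):

    #include <stdexcept>

    // Placeholder enums standing in for llama_ftype / ggml_type.
    enum class ftype { Q4_K_S, Q4_K_M, Q5_K_S, Q5_K_M, Q6_K };
    enum class qtype { Q4_K, Q5_K, Q6_K };

    static qtype default_tensor_type(ftype f) {
        switch (f) {
            case ftype::Q4_K_S:
            case ftype::Q4_K_M: return qtype::Q4_K;   // two ftypes share one base type
            case ftype::Q5_K_S:
            case ftype::Q5_K_M: return qtype::Q5_K;
            case ftype::Q6_K:   return qtype::Q6_K;
        }
        throw std::runtime_error("invalid output file type");
    }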
|
@@ -9767,7 +10492,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9767
10492
|
quantize &= !params->only_copy;
|
9768
10493
|
|
9769
10494
|
// do not quantize expert gating tensors
|
9770
|
-
quantize &= name.
|
10495
|
+
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
|
10496
|
+
|
10497
|
+
// do not quantize positional embeddings and token types (BERT)
|
10498
|
+
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
|
10499
|
+
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
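The added checks keep certain tensors in full precision by and-ing one boolean per exclusion into `quantize`: the expert gating tensor, and for BERT-style models the positional embeddings and token-type embeddings. A standalone sketch of the same pattern with plain name strings (the literal names are illustrative placeholders for what LLM_TN(model.arch)(...) expands to):

    #include <string>

    // Decide whether a tensor should be quantized, mirroring the chained &= checks above.
    static bool should_quantize(const std::string & name, bool only_copy) {
        bool quantize = true;
        quantize &= !only_copy;
        quantize &= name != "ffn_gate_inp.weight";   // expert gating (MoE)
        quantize &= name != "position_embd.weight";  // positional embeddings (BERT)
        quantize &= name != "token_types.weight";    // token type embeddings (BERT)
        return quantize;
    }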
|
9771
10500
|
|
9772
10501
|
enum ggml_type new_type;
|
9773
10502
|
void * new_data;
|
@@ -10269,6 +10998,7 @@ struct llama_context_params llama_context_default_params() {
|
|
10269
10998
|
/*.logits_all =*/ false,
|
10270
10999
|
/*.embedding =*/ false,
|
10271
11000
|
/*.offload_kqv =*/ true,
|
11001
|
+
/*.do_pooling =*/ true,
|
10272
11002
|
};
|
10273
11003
|
|
10274
11004
|
return result;
|
@@ -10295,6 +11025,8 @@ size_t llama_max_devices(void) {
|
|
10295
11025
|
return GGML_CUDA_MAX_DEVICES;
|
10296
11026
|
#elif defined(GGML_USE_SYCL)
|
10297
11027
|
return GGML_SYCL_MAX_DEVICES;
|
11028
|
+
#elif defined(GGML_USE_VULKAN)
|
11029
|
+
return GGML_VK_MAX_DEVICES;
|
10298
11030
|
#else
|
10299
11031
|
return 1;
|
10300
11032
|
#endif
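The device count is resolved at compile time: whichever GPU backend the library was built with reports its own maximum (the Vulkan branch returning GGML_VK_MAX_DEVICES is the new one), and other builds report a single device. A sketch of the same preprocessor dispatch; the GGML_USE_* guards and *_MAX_DEVICES macros are assumed to come from the backend headers, and with none of them defined this compiles to the single-device fallback:

    #include <cstddef>

    static size_t max_devices(void) {
    #if defined(GGML_USE_CUBLAS)
        return GGML_CUDA_MAX_DEVICES;
    #elif defined(GGML_USE_SYCL)
        return GGML_SYCL_MAX_DEVICES;
    #elif defined(GGML_USE_VULKAN)
        return GGML_VK_MAX_DEVICES;
    #else
        return 1;                      // CPU-only and other builds expose one device
    #endif
    }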
|
@@ -10422,6 +11154,7 @@ struct llama_context * llama_new_context_with_model(
|
|
10422
11154
|
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
10423
11155
|
cparams.mul_mat_q = params.mul_mat_q;
|
10424
11156
|
cparams.offload_kqv = params.offload_kqv;
|
11157
|
+
cparams.do_pooling = params.do_pooling;
|
10425
11158
|
|
10426
11159
|
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
10427
11160
|
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
|
@@ -10502,13 +11235,15 @@ struct llama_context * llama_new_context_with_model(
|
|
10502
11235
|
}
|
10503
11236
|
#elif defined(GGML_USE_VULKAN)
|
10504
11237
|
if (model->n_gpu_layers > 0) {
|
10505
|
-
|
10506
|
-
|
10507
|
-
|
10508
|
-
|
10509
|
-
|
11238
|
+
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
11239
|
+
ggml_backend_t backend = ggml_backend_vk_init(device);
|
11240
|
+
if (backend == nullptr) {
|
11241
|
+
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
|
11242
|
+
llama_free(ctx);
|
11243
|
+
return nullptr;
|
11244
|
+
}
|
11245
|
+
ctx->backends.push_back(backend);
|
10510
11246
|
}
|
10511
|
-
ctx->backends.push_back(backend);
|
10512
11247
|
}
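The rewritten Vulkan branch creates one backend per device instead of a single backend, and treats any failed initialization as fatal for context creation (log, free the context, return nullptr). A standalone sketch of that "initialize every device or bail out" shape with a hypothetical init_device that can fail; none of these names are the real ggml/Vulkan entry points:

    #include <cstdio>
    #include <optional>
    #include <vector>

    // Hypothetical device handle and initializer standing in for ggml_backend_vk_init().
    struct backend_t { int device; };

    static std::optional<backend_t> init_device(int device) {
        if (device < 0) return std::nullopt;       // pretend initialization can fail
        return backend_t{device};
    }

    // One backend per device; on any failure, release what was created and report.
    static bool init_all_devices(int device_count, std::vector<backend_t> & backends) {
        for (int device = 0; device < device_count; ++device) {
            auto backend = init_device(device);
            if (!backend) {
                fprintf(stderr, "failed to initialize device %d backend\n", device);
                backends.clear();                  // analogous to llama_free(ctx) above
                return false;
            }
            backends.push_back(*backend);
        }
        return true;
    }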
|
10513
11248
|
#elif defined(GGML_USE_SYCL)
|
10514
11249
|
if (model->n_gpu_layers > 0) {
|
@@ -10567,14 +11302,14 @@ struct llama_context * llama_new_context_with_model(
|
|
10567
11302
|
// resized during inference, reserve maximum
|
10568
11303
|
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
|
10569
11304
|
|
10570
|
-
if (params.embedding){
|
11305
|
+
if (params.embedding) {
|
10571
11306
|
ctx->embedding.resize(hparams.n_embd);
|
10572
11307
|
}
|
10573
11308
|
|
10574
11309
|
// graph inputs
|
10575
11310
|
{
|
10576
11311
|
ggml_init_params init_params = {
|
10577
|
-
/* .mem_size */ ggml_tensor_overhead()*
|
11312
|
+
/* .mem_size */ ggml_tensor_overhead()*7,
|
10578
11313
|
/* .mem_buffer */ nullptr,
|
10579
11314
|
/* .no_alloc */ true,
|
10580
11315
|
};
|
@@ -10585,12 +11320,14 @@ struct llama_context * llama_new_context_with_model(
|
|
10585
11320
|
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
10586
11321
|
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
10587
11322
|
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
11323
|
+
ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
|
10588
11324
|
|
10589
11325
|
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
10590
11326
|
ggml_set_name(ctx->inp_embd, "inp_embd");
|
10591
11327
|
ggml_set_name(ctx->inp_pos, "inp_pos");
|
10592
11328
|
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
10593
11329
|
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
11330
|
+
ggml_set_name(ctx->inp_sum, "inp_sum");
|
10594
11331
|
|
10595
11332
|
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
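The input context is created with no_alloc enabled, so declaring the per-batch input tensors (including the new n_batch x n_batch inp_sum used for pooling) only records metadata; the data for all of them is then placed in one buffer by ggml_backend_alloc_ctx_tensors_from_buft. A minimal mock of that "declare metadata now, allocate one shared buffer later" pattern, with illustrative types and sizes (the real code uses ggml contexts and backend buffers):

    #include <cstddef>
    #include <string>
    #include <vector>

    struct tensor_desc {
        std::string name;
        size_t      nbytes;
    };

    struct input_plan {
        std::vector<tensor_desc> tensors;

        void declare(const std::string & name, size_t nbytes) {
            tensors.push_back({name, nbytes});   // metadata only, no data yet
        }
        size_t total_bytes() const {             // what one shared buffer must hold
            size_t sum = 0;
            for (const auto & t : tensors) sum += t.nbytes;
            return sum;
        }
    };

    // Usage with illustrative shapes (n_batch tokens, n_ctx cells, 4-byte floats):
    //   input_plan plan;
    //   plan.declare("inp_tokens",  n_batch * sizeof(int32_t));
    //   plan.declare("inp_KQ_mask", n_ctx * n_batch * sizeof(float));
    //   plan.declare("inp_sum",     n_batch * n_batch * sizeof(float));
    //   allocate_buffer(plan.total_bytes());    // stands in for the buft allocation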
|
10596
11333
|
|
@@ -10616,23 +11353,27 @@ struct llama_context * llama_new_context_with_model(
|
|
10616
11353
|
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
|
10617
11354
|
|
10618
11355
|
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
|
10619
|
-
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
10620
11356
|
|
10621
11357
|
// build worst-case graph
|
10622
11358
|
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
|
10623
11359
|
int n_past = cparams.n_ctx - n_tokens;
|
10624
11360
|
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
10625
|
-
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
|
11361
|
+
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
|
10626
11362
|
|
10627
11363
|
// initialize scheduler with the worst-case graph
|
10628
|
-
|
10629
|
-
|
11364
|
+
if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
|
11365
|
+
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
11366
|
+
llama_free(ctx);
|
11367
|
+
return nullptr;
|
11368
|
+
}
|
10630
11369
|
|
10631
|
-
for (
|
10632
|
-
|
11370
|
+
for (size_t i = 0; i < ctx->backends.size(); i++) {
|
11371
|
+
ggml_backend_t backend = ctx->backends[i];
|
11372
|
+
ggml_backend_buffer_type_t buft = backend_buft[i];
|
11373
|
+
size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
|
10633
11374
|
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
10634
|
-
|
10635
|
-
|
11375
|
+
ggml_backend_buft_name(buft),
|
11376
|
+
size / 1024.0 / 1024.0);
|
10636
11377
|
}
|
10637
11378
|
|
10638
11379
|
// note: the number of splits during measure is higher than during inference due to the kv shift
|
@@ -10735,7 +11476,7 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
|
|
10735
11476
|
|
10736
11477
|
int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
10737
11478
|
return snprintf(buf, buf_size, "%s %s %s",
|
10738
|
-
llama_model_arch_name(model->arch)
|
11479
|
+
llama_model_arch_name(model->arch),
|
10739
11480
|
llama_model_type_name(model->type),
|
10740
11481
|
llama_model_ftype_name(model->ftype).c_str());
|
10741
11482
|
}
|
@@ -11437,6 +12178,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
|
11437
12178
|
return ctx->embedding.data();
|
11438
12179
|
}
|
11439
12180
|
|
12181
|
+
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
12182
|
+
return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
|
12183
|
+
}
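The new accessor returns a pointer into the flat embedding buffer: row i starts at i * n_embd floats. A minimal standalone illustration of reading one row out of such a flat buffer (sizes and names made up; the real buffer is ctx->embedding and the width comes from hparams.n_embd):

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd = 4;                     // illustrative embedding width
        const int n_rows = 3;                     // three pooled embeddings, one flat buffer
        std::vector<float> flat(n_rows * n_embd);
        for (int k = 0; k < n_rows * n_embd; ++k) flat[k] = (float) k;

        // Same arithmetic as llama_get_embeddings_ith: row i begins at i * n_embd.
        const int i = 1;
        const float * row = flat.data() + i * n_embd;
        for (int j = 0; j < n_embd; ++j) {
            printf("%.1f ", row[j]);              // prints 4.0 5.0 6.0 7.0
        }
        printf("\n");
        return 0;
    }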
|
12184
|
+
|
11440
12185
|
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
|
11441
12186
|
return model->vocab.id_to_token[token].text.c_str();
|
11442
12187
|
}
|
@@ -11521,6 +12266,7 @@ static std::string llama_decode_text(const std::string & text) {
|
|
11521
12266
|
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
|
11522
12267
|
if (0 <= token && token < llama_n_vocab(model)) {
|
11523
12268
|
switch (llama_vocab_get_type(model->vocab)) {
|
12269
|
+
case LLAMA_VOCAB_TYPE_WPM:
|
11524
12270
|
case LLAMA_VOCAB_TYPE_SPM: {
|
11525
12271
|
// NOTE: we accept all unsupported token types,
|
11526
12272
|
// suppressing them like CONTROL tokens.
|
@@ -11644,6 +12390,7 @@ const char * llama_print_system_info(void) {
|
|
11644
12390
|
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
11645
12391
|
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
11646
12392
|
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
12393
|
+
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
|
11647
12394
|
|
11648
12395
|
return s.c_str();
|
11649
12396
|
}
|