llama_cpp 0.16.0 → 0.16.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/extconf.rb +2 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +110 -53
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
- data/vendor/tmp/llama.cpp/ggml.c +102 -275
- data/vendor/tmp/llama.cpp/llama.cpp +103 -47
- data/vendor/tmp/llama.cpp/llama.h +4 -0
- metadata +15 -3
data/vendor/tmp/llama.cpp/llama.cpp CHANGED

@@ -21,6 +21,10 @@
 #   include "ggml-kompute.h"
 #endif
 
+#ifdef GGML_USE_BLAS
+#   include "ggml-blas.h"
+#endif
+
 #ifdef GGML_USE_METAL
 #   include "ggml-metal.h"
 #endif
@@ -704,6 +708,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
             { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_NORM_2,     "blk.%d.attn_norm_2" },
             { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
             { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
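The new LLM_TENSOR_ATTN_NORM_2 entry follows the usual per-layer naming scheme: the %d placeholder is filled with the block index, and the create_tensor calls later in this diff pass a "weight"/"bias" suffix through the tn() helper. A rough, hypothetical illustration of the resulting tensor name:

#include <cstdio>

int main() {
    char name[64];
    // hypothetical expansion for block 3; the suffix comes from tn(..., "weight", i)
    snprintf(name, sizeof(name), "blk.%d.attn_norm_2", 3);
    printf("%s.weight\n", name); // prints: blk.3.attn_norm_2.weight
    return 0;
}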
@@ -2298,9 +2303,13 @@ struct llama_context {
     std::vector<ggml_backend_t> backends;
 #ifdef GGML_USE_METAL
     ggml_backend_t backend_metal = nullptr;
+#endif
+#ifdef GGML_USE_BLAS
+    ggml_backend_t backend_blas = nullptr;
 #endif
     ggml_backend_t backend_cpu = nullptr;
 
+
     const llama_model & model;
 
     // key + value cache for the self attention
@@ -4552,35 +4561,6 @@ static void llm_load_vocab(
         vocab.special_cls_id  = -1;
         vocab.special_mask_id = -1;
 
-        // For Fill-In-the-Middle (FIM)/infill models which where converted
-        // prior to support of FIM special tokens in GGUF, the following
-        // will allow those models to continue to work. The general names
-        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
-        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
-        // new versions of these models have been published.
-        std::string gen_name;
-        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
-        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
-            [](unsigned char c){ return std::tolower(c); });
-
-        if (gen_name.find("code") != std::string::npos) {
-            if (model.arch == LLM_ARCH_LLAMA) {
-                vocab.special_prefix_id = 32007;
-                vocab.special_suffix_id = 32008;
-                vocab.special_middle_id = 32009;
-                vocab.special_eot_id    = 32010;
-            } else if (model.arch == LLM_ARCH_GEMMA) {
-                vocab.special_prefix_id = 67;
-                vocab.special_suffix_id = 69;
-                vocab.special_middle_id = 68;
-                // TODO: this is not EOT, it is "file separator" token, needs fix
-                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
-                //vocab.special_eot_id    = 70;
-                vocab.special_eot_id    = 107;
-            }
-        }
-
         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {
             vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4653,8 +4633,7 @@ static void llm_load_vocab(
             LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
             LLAMA_LOG_WARN("%s: \n", __func__);
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (
-                tokenizer_pre == "default") {
+        } else if (tokenizer_pre == "default") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         } else if (
                 tokenizer_pre == "llama3" ||
@@ -4681,7 +4660,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de") {
+                tokenizer_pre == "jina-v2-de" ||
+                tokenizer_pre == "jina-v2-code") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -4704,6 +4684,9 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "smaug-bpe") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+        } else if (
+                tokenizer_pre == "poro-chat") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -4761,6 +4744,45 @@ static void llm_load_vocab(
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        // For Fill-In-the-Middle (FIM)/infill models which where converted
+        // prior to support of FIM special tokens in GGUF, the following
+        // will allow those models to continue to work. The general names
+        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+        // new versions of these models have been published.
+        std::string gen_name;
+        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+            [](unsigned char c){ return std::tolower(c); });
+
+        if (gen_name.find("code") != std::string::npos) {
+            if (model.arch == LLM_ARCH_LLAMA
+              && 32010 < vocab.id_to_token.size()
+              && vocab.id_to_token[32007].text == "<PRE>"
+              && vocab.id_to_token[32008].text == "<SUF>"
+              && vocab.id_to_token[32009].text == "<MID>"
+              && vocab.id_to_token[32010].text == "<EOT>") {
+                vocab.special_prefix_id = 32007;
+                vocab.special_suffix_id = 32008;
+                vocab.special_middle_id = 32009;
+                vocab.special_eot_id    = 32010;
+            } else if (model.arch == LLM_ARCH_GEMMA
+              && 107 < vocab.id_to_token.size()
+              && vocab.id_to_token[67].text == "<|fim_prefix|>"
+              && vocab.id_to_token[69].text == "<|fim_suffix|>"
+              && vocab.id_to_token[68].text == "<|fim_middle|>"
+              && vocab.id_to_token[107].text == "<end_of_turn>") {
+                vocab.special_prefix_id = 67;
+                vocab.special_suffix_id = 69;
+                vocab.special_middle_id = 68;
+                // TODO: this is not EOT, it is "file separator" token, needs fix
+                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+                //vocab.special_eot_id    = 70;
+                vocab.special_eot_id    = 107;
+            }
+        }
+
         try {
             vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
         } catch (const std::exception & e) {
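The FIM fallback now only fires when the hard-coded ids actually carry the expected special-token text, instead of applying to any model whose general.name contains "code". A minimal sketch of that guarded-assignment pattern, using a hypothetical helper that is not part of llama.cpp:

#include <cstdint>
#include <string>
#include <vector>

struct token_data { std::string text; };

// Return the candidate id only if the vocabulary really contains the expected
// text at that position; otherwise leave the special token unset (-1), as
// llm_load_vocab does by default.
static int32_t verified_special_id(const std::vector<token_data> & id_to_token,
                                   int32_t candidate, const std::string & expected) {
    if (candidate >= 0 && candidate < (int32_t) id_to_token.size() &&
        id_to_token[candidate].text == expected) {
        return candidate;
    }
    return -1;
}

// e.g. special_prefix_id = verified_special_id(id_to_token, 32007, "<PRE>");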
@@ -5515,7 +5537,7 @@ static bool llm_load_tensors(
 
                         layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
                     } else {
-                        layer.ffn_gate
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                     }
 
                     layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5556,6 +5578,9 @@ static bool llm_load_tensors(
                     layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
                     layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
 
+                    layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
                     layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 
@@ -8519,6 +8544,11 @@ struct llm_build_context {
                 // attention layer norm
                 cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
 
+                if (model.layers[il].attn_norm_2 != nullptr) {
+                    cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+                    cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+                }
+
                 struct ggml_tensor * ffn_inp = cur;
                 cb(ffn_inp, "ffn_inp", il);
 
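For checkpoints that actually ship the optional attn_norm_2 tensors (loaded above with TENSOR_NOT_REQUIRED), this adds a second normalization after the attention block: normalize the attention output, re-add the layer input, then normalize again. A schematic plain-C++ sketch of that dataflow, with a bare layer norm and no learned scale/bias, just to make the order of operations explicit:

#include <cmath>
#include <cstddef>
#include <vector>

static std::vector<float> layer_norm(const std::vector<float> & x) {
    float mean = 0.0f, var = 0.0f;
    for (float v : x) mean += v;
    mean /= x.size();
    for (float v : x) var += (v - mean) * (v - mean);
    var /= x.size();
    std::vector<float> y(x.size());
    for (std::size_t i = 0; i < x.size(); ++i) {
        y[i] = (x[i] - mean) / std::sqrt(var + 1e-5f);
    }
    return y;
}

static std::vector<float> post_attention(const std::vector<float> & attn_out,
                                         const std::vector<float> & layer_input,
                                         bool has_attn_norm_2) {
    std::vector<float> cur = layer_norm(attn_out);        // attn_out_norm
    if (has_attn_norm_2) {
        for (std::size_t i = 0; i < cur.size(); ++i) {
            cur[i] += layer_input[i];                      // re-add the layer input
        }
        cur = layer_norm(cur);                             // attn_norm_2
    }
    return cur;
}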
@@ -11520,7 +11550,8 @@ static struct ggml_cgraph * llama_build_graph(
         if (batch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 for (auto * backend : lctx.backends) {
-                    if (
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
                         ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
                         break;
                     }
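The "norm" tensor is now only pinned to a backend that both supports the layer's buffer type and can run (or offload) the op itself. A short sketch of that selection rule, using the ggml-backend calls that appear in the hunk above; it is an illustration, not the llama.cpp scheduler code:

#include <vector>

#include "ggml.h"
#include "ggml-backend.h"

// Return the first backend that can host a tensor allocated with `buft`
// and execute (or offload) the given op; nullptr if none qualifies.
static ggml_backend_t pick_backend(const std::vector<ggml_backend_t> & backends,
                                   ggml_backend_buffer_type_t buft,
                                   struct ggml_tensor * op) {
    for (ggml_backend_t backend : backends) {
        if (ggml_backend_supports_buft(backend, buft) &&
            (ggml_backend_supports_op(backend, op) || ggml_backend_offload_op(backend, op))) {
            return backend;
        }
    }
    return nullptr;
}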
@@ -12017,6 +12048,11 @@ static void llama_graph_compute(
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
+#ifdef GGML_USE_BLAS
+    if (lctx.backend_blas != nullptr) {
+        ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+    }
+#endif
 
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
@@ -12239,17 +12275,6 @@ static int llama_decode_internal(
         }
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-        // for big prompts, if BLAS is enabled, it is better to use only one thread
-        // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-        //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-        //       with the BLAS calls. need a better solution
-        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-        //       being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-        if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-            n_threads = std::min(4, n_threads);
-        }
-
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
         llama_set_inputs(lctx, u_batch);
@@ -13016,6 +13041,11 @@ struct llm_tokenizer_bpe {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_PORO:
+                word_collection = unicode_regex_split(text, {
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                });
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
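The new PORO branch uses a single pattern that is much simpler than the default BPE pre-tokenization regex. An annotated copy of the pattern follows; the comments are an interpretation of the character class, not part of llama.cpp:

#include <string>

// The PORO pre-tokenizer pattern, read piece by piece:
//   " ?"                     - an optional leading space, kept attached to the word
//   "[^(\s|.,!?…。,、।۔،)]+" - one or more characters that are neither whitespace (\s)
//                              nor one of the literal separators ( | ) . , ! ? … 。 , 、 । ۔ ،
static const std::string poro_pattern = " ?[^(\\s|.,!?…。,、।۔،)]+";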
@@ -13631,7 +13661,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
         const uint32_t chr) {
 
     bool found            = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
 
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
 
@@ -13640,6 +13670,10 @@
             // inclusive range, e.g. [a-z]
             found = found || (pos->value <= chr && chr <= pos[1].value);
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            found = true;
+            pos += 1;
         } else {
             // exact char match, e.g. [a] or "a"
             found = found || pos->value == chr;
@@ -13657,7 +13691,7 @@ static bool llama_grammar_match_partial_char(
         const llama_grammar_element * pos,
         const llama_partial_utf8      partial_utf8) {
 
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
 
     uint32_t partial_value = partial_utf8.value;
@@ -13687,6 +13721,9 @@
                 return is_positive_char;
             }
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            return true;
         } else {
             // exact char match, e.g. [a] or "a"
             if (low <= pos->value && pos->value <= high) {
@@ -13747,6 +13784,7 @@ static void llama_grammar_advance_stack(
             }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
+        case LLAMA_GRETYPE_CHAR_ANY:
            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
                 new_stacks.emplace_back(stack);
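Taken together with the llama.h change further down, these hunks add a "." element (LLAMA_GRETYPE_CHAR_ANY) to the grammar engine: it behaves like a positive character class that accepts every code point. A self-contained sketch of that matching rule, using its own mini element type rather than the real llama_grammar_element:

#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified stand-in for llama_grammar_element; only the cases relevant to
// single-character matching are modeled here.
enum class gre { chr, chr_rng_upper, chr_any };
struct elem { gre type; uint32_t value; };

// Does the compiled alternation (e.g. from [a-z.]) accept code point `c`?
static bool accepts(const std::vector<elem> & alt, uint32_t c) {
    for (std::size_t i = 0; i < alt.size(); ) {
        if (alt[i].type == gre::chr_any) {
            return true;                                                   // "." matches any character
        }
        if (i + 1 < alt.size() && alt[i + 1].type == gre::chr_rng_upper) {
            if (alt[i].value <= c && c <= alt[i + 1].value) return true;   // inclusive range, e.g. [a-z]
            i += 2;
        } else {
            if (alt[i].value == c) return true;                            // exact char match
            i += 1;
        }
    }
    return false;
}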
@@ -15220,6 +15258,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     if (imatrix_data) {
         LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
         qs.has_imatrix = true;
+        // check imatrix for nans or infs
+        for (const auto & kv : *imatrix_data) {
+            for (float f : kv.second) {
+                if (!std::isfinite(f)) {
+                    throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+                }
+            }
+        }
     }
 }
 
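Quantization now rejects an importance matrix containing NaN or infinite entries up front, instead of silently producing corrupted quants later. A standalone illustration of the same check, assuming the imatrix is a map from tensor name to float vector as in the loop above; std::isfinite returns false for NaN, +inf and -inf:

#include <cmath>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

static void validate_imatrix(const std::map<std::string, std::vector<float>> & imatrix) {
    for (const auto & kv : imatrix) {
        for (float f : kv.second) {
            if (!std::isfinite(f)) {
                throw std::runtime_error("imatrix entry '" + kv.first + "' contains a non-finite value");
            }
        }
    }
}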
@@ -16226,6 +16272,16 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #endif
+
+#ifdef GGML_USE_BLAS
+        ctx->backend_blas = ggml_backend_blas_init();
+        if (ctx->backend_blas == nullptr) {
+            LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+        } else {
+            ctx->backends.push_back(ctx->backend_blas);
+        }
+#endif
+
 #if defined(GGML_USE_RPC)
         if (model->n_gpu_layers > 0) {
             for (const auto & endpoint : model->rpc_servers) {
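Putting the llama.cpp pieces together, the new ggml-blas backend is optional: it is compiled behind GGML_USE_BLAS, initialized when a context is created, appended to the backend list, and given a thread count before each graph compute. A rough sketch of that wiring outside llama.cpp, using only the calls visible in this diff (ggml_backend_blas_init, ggml_backend_blas_set_n_threads):

#include <cstdio>
#include <vector>

#include "ggml-backend.h"
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif

// Append the BLAS backend to a backend list if it was compiled in and
// initializes correctly; otherwise fall through to the remaining backends.
static void add_blas_backend(std::vector<ggml_backend_t> & backends, int n_threads) {
#ifdef GGML_USE_BLAS
    ggml_backend_t blas = ggml_backend_blas_init();
    if (blas == nullptr) {
        fprintf(stderr, "failed to initialize BLAS backend\n");
        return;
    }
    ggml_backend_blas_set_n_threads(blas, n_threads);
    backends.push_back(blas);
#else
    (void) backends;
    (void) n_threads;
#endif
}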
data/vendor/tmp/llama.cpp/llama.h CHANGED

@@ -86,6 +86,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_OLMO  = 12,
         LLAMA_VOCAB_PRE_TYPE_DBRX  = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+        LLAMA_VOCAB_PRE_TYPE_PORO  = 15,
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -365,6 +366,9 @@ extern "C" {
         // modifies a preceding LLAMA_GRETYPE_CHAR or
        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
         LLAMA_GRETYPE_CHAR_ALT = 6,
+
+        // any character (.)
+        LLAMA_GRETYPE_CHAR_ANY = 7,
     };
 
     typedef struct llama_grammar_element {
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.16.0
+  version: 0.16.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-06-
+date: 2024-06-15 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -42,6 +42,8 @@ files:
 - vendor/tmp/llama.cpp/ggml-backend-impl.h
 - vendor/tmp/llama.cpp/ggml-backend.c
 - vendor/tmp/llama.cpp/ggml-backend.h
+- vendor/tmp/llama.cpp/ggml-blas.cpp
+- vendor/tmp/llama.cpp/ggml-blas.h
 - vendor/tmp/llama.cpp/ggml-common.h
 - vendor/tmp/llama.cpp/ggml-cuda.cu
 - vendor/tmp/llama.cpp/ggml-cuda.h
@@ -161,6 +163,16 @@ files:
 - vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
 - vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
 - vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu
 - vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu
 - vendor/tmp/llama.cpp/ggml-cuda/unary.cu
 - vendor/tmp/llama.cpp/ggml-cuda/upscale.cu
@@ -214,7 +226,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.5.
+rubygems_version: 3.5.9
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.