llama_cpp 0.16.0 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/extconf.rb +2 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +110 -53
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
- data/vendor/tmp/llama.cpp/ggml.c +102 -275
- data/vendor/tmp/llama.cpp/llama.cpp +103 -47
- data/vendor/tmp/llama.cpp/llama.h +4 -0
- metadata +15 -3
data/vendor/tmp/llama.cpp/llama.cpp
CHANGED

@@ -21,6 +21,10 @@
 # include "ggml-kompute.h"
 #endif
 
+#ifdef GGML_USE_BLAS
+# include "ggml-blas.h"
+#endif
+
 #ifdef GGML_USE_METAL
 # include "ggml-metal.h"
 #endif
@@ -704,6 +708,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
     { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
     { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
     { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+    { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
     { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
     { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
     { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -2298,9 +2303,13 @@ struct llama_context {
     std::vector<ggml_backend_t> backends;
 #ifdef GGML_USE_METAL
     ggml_backend_t backend_metal = nullptr;
+#endif
+#ifdef GGML_USE_BLAS
+    ggml_backend_t backend_blas = nullptr;
 #endif
     ggml_backend_t backend_cpu = nullptr;
 
+
     const llama_model & model;
 
     // key + value cache for the self attention
@@ -4552,35 +4561,6 @@ static void llm_load_vocab(
     vocab.special_cls_id = -1;
     vocab.special_mask_id = -1;
 
-    // For Fill-In-the-Middle (FIM)/infill models which where converted
-    // prior to support of FIM special tokens in GGUF, the following
-    // will allow those models to continue to work. The general names
-    // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
-    // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
-    // new versions of these models have been published.
-    std::string gen_name;
-    ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
-    std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
-        [](unsigned char c){ return std::tolower(c); });
-
-    if (gen_name.find("code") != std::string::npos) {
-        if (model.arch == LLM_ARCH_LLAMA) {
-            vocab.special_prefix_id = 32007;
-            vocab.special_suffix_id = 32008;
-            vocab.special_middle_id = 32009;
-            vocab.special_eot_id = 32010;
-        } else if (model.arch == LLM_ARCH_GEMMA) {
-            vocab.special_prefix_id = 67;
-            vocab.special_suffix_id = 69;
-            vocab.special_middle_id = 68;
-            // TODO: this is not EOT, it is "file separator" token, needs fix
-            // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
-            //vocab.special_eot_id = 70;
-            vocab.special_eot_id = 107;
-        }
-    }
-
     const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
     if (add_space_prefix_keyidx != -1) {
         vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4653,8 +4633,7 @@ static void llm_load_vocab(
         LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
         LLAMA_LOG_WARN("%s: \n", __func__);
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-    } else if (
-            tokenizer_pre == "default") {
+    } else if (tokenizer_pre == "default") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
     } else if (
             tokenizer_pre == "llama3" ||
@@ -4681,7 +4660,8 @@ static void llm_load_vocab(
             tokenizer_pre == "jina-es" ||
             tokenizer_pre == "jina-de" ||
             tokenizer_pre == "jina-v2-es" ||
-            tokenizer_pre == "jina-v2-de"
+            tokenizer_pre == "jina-v2-de" ||
+            tokenizer_pre == "jina-v2-code") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
     } else if (
             tokenizer_pre == "refact") {
@@ -4704,6 +4684,9 @@ static void llm_load_vocab(
     } else if (
             tokenizer_pre == "smaug-bpe") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+    } else if (
+            tokenizer_pre == "poro-chat") {
+        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
     } else {
         throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
     }
@@ -4761,6 +4744,45 @@ static void llm_load_vocab(
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        // For Fill-In-the-Middle (FIM)/infill models which where converted
+        // prior to support of FIM special tokens in GGUF, the following
+        // will allow those models to continue to work. The general names
+        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+        // new versions of these models have been published.
+        std::string gen_name;
+        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+            [](unsigned char c){ return std::tolower(c); });
+
+        if (gen_name.find("code") != std::string::npos) {
+            if (model.arch == LLM_ARCH_LLAMA
+                  && 32010 < vocab.id_to_token.size()
+                  && vocab.id_to_token[32007].text == "<PRE>"
+                  && vocab.id_to_token[32008].text == "<SUF>"
+                  && vocab.id_to_token[32009].text == "<MID>"
+                  && vocab.id_to_token[32010].text == "<EOT>") {
+                vocab.special_prefix_id = 32007;
+                vocab.special_suffix_id = 32008;
+                vocab.special_middle_id = 32009;
+                vocab.special_eot_id = 32010;
+            } else if (model.arch == LLM_ARCH_GEMMA
+                  && 107 < vocab.id_to_token.size()
+                  && vocab.id_to_token[67].text == "<|fim_prefix|>"
+                  && vocab.id_to_token[69].text == "<|fim_suffix|>"
+                  && vocab.id_to_token[68].text == "<|fim_middle|>"
+                  && vocab.id_to_token[107].text == "<end_of_turn>") {
+                vocab.special_prefix_id = 67;
+                vocab.special_suffix_id = 69;
+                vocab.special_middle_id = 68;
+                // TODO: this is not EOT, it is "file separator" token, needs fix
+                // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+                //vocab.special_eot_id = 70;
+                vocab.special_eot_id = 107;
+            }
+        }
+
     try {
         vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
     } catch (const std::exception & e) {
@@ -5515,7 +5537,7 @@ static bool llm_load_tensors(
 
                     layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
                 } else {
-                    layer.ffn_gate
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                 }
 
                 layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5556,6 +5578,9 @@ static bool llm_load_tensors(
                 layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
                 layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
 
+                layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
                 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 
@@ -8519,6 +8544,11 @@ struct llm_build_context {
                 // attention layer norm
                 cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
 
+                if (model.layers[il].attn_norm_2 != nullptr) {
+                    cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+                    cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+                }
+
                 struct ggml_tensor * ffn_inp = cur;
                 cb(ffn_inp, "ffn_inp", il);
 
@@ -11520,7 +11550,8 @@ static struct ggml_cgraph * llama_build_graph(
         if (batch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 for (auto * backend : lctx.backends) {
-                    if (
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
                         ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
                         break;
                     }
@@ -12017,6 +12048,11 @@ static void llama_graph_compute(
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
+#ifdef GGML_USE_BLAS
+    if (lctx.backend_blas != nullptr) {
+        ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+    }
+#endif
 
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
@@ -12239,17 +12275,6 @@ static int llama_decode_internal(
         }
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-        // for big prompts, if BLAS is enabled, it is better to use only one thread
-        // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-        //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-        //       with the BLAS calls. need a better solution
-        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-        //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-        if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-            n_threads = std::min(4, n_threads);
-        }
-
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
         llama_set_inputs(lctx, u_batch);
@@ -13016,6 +13041,11 @@ struct llm_tokenizer_bpe {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_PORO:
+                word_collection = unicode_regex_split(text, {
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                });
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
@@ -13631,7 +13661,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
         const uint32_t chr) {
 
     bool found = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
 
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
 
@@ -13640,6 +13670,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
             // inclusive range, e.g. [a-z]
             found = found || (pos->value <= chr && chr <= pos[1].value);
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            found = true;
+            pos += 1;
         } else {
             // exact char match, e.g. [a] or "a"
             found = found || pos->value == chr;
@@ -13657,7 +13691,7 @@ static bool llama_grammar_match_partial_char(
         const llama_grammar_element * pos,
         const llama_partial_utf8 partial_utf8) {
 
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
 
     uint32_t partial_value = partial_utf8.value;
@@ -13687,6 +13721,9 @@ static bool llama_grammar_match_partial_char(
             return is_positive_char;
         }
         pos += 2;
+    } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+        // Any character matches "."
+        return true;
     } else {
         // exact char match, e.g. [a] or "a"
         if (low <= pos->value && pos->value <= high) {
@@ -13747,6 +13784,7 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
+        case LLAMA_GRETYPE_CHAR_ANY:
             if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
                 new_stacks.emplace_back(stack);
@@ -15220,6 +15258,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     if (imatrix_data) {
         LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
         qs.has_imatrix = true;
+        // check imatrix for nans or infs
+        for (const auto & kv : *imatrix_data) {
+            for (float f : kv.second) {
+                if (!std::isfinite(f)) {
+                    throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+                }
+            }
+        }
     }
 }
 
@@ -16226,6 +16272,16 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #endif
+
+#ifdef GGML_USE_BLAS
+    ctx->backend_blas = ggml_backend_blas_init();
+    if (ctx->backend_blas == nullptr) {
+        LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+    } else {
+        ctx->backends.push_back(ctx->backend_blas);
+    }
+#endif
+
 #if defined(GGML_USE_RPC)
     if (model->n_gpu_layers > 0) {
         for (const auto & endpoint : model->rpc_servers) {
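Taken together, these hunks wire the new ggml-blas backend into llama.cpp: the context now holds a backend_blas handle, llama_graph_compute forwards the thread count to it, and llama_new_context_with_model registers it alongside the other backends. A minimal sketch of how the functions introduced by ggml-blas.h could be exercised on their own (the fallback logic and the ggml_backend_free cleanup call are assumptions for illustration, not taken from this diff):

    // Sketch only: standalone use of the BLAS backend added in ggml-blas.h.
    #include "ggml-blas.h"

    static void try_blas_backend(int n_threads) {
        ggml_backend_t blas = ggml_backend_blas_init();    // nullptr if the backend cannot be created
        if (blas == nullptr) {
            return;                                        // fall back to the CPU backend
        }
        ggml_backend_blas_set_n_threads(blas, n_threads);  // same call llama_graph_compute() now makes
        // ... schedule matmul-heavy graphs on `blas` via ggml_backend_sched ...
        ggml_backend_free(blas);                           // assumed standard ggml-backend cleanup
    }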
data/vendor/tmp/llama.cpp/llama.h
CHANGED

@@ -86,6 +86,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
         LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+        LLAMA_VOCAB_PRE_TYPE_PORO = 15,
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -365,6 +366,9 @@ extern "C" {
         // modifies a preceding LLAMA_GRETYPE_CHAR or
         // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
         LLAMA_GRETYPE_CHAR_ALT = 6,
+
+        // any character (.)
+        LLAMA_GRETYPE_CHAR_ANY = 7,
     };
 
     typedef struct llama_grammar_element {
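The new LLAMA_GRETYPE_CHAR_ANY element, together with the matcher changes above, lets grammars use "." to match any single character. A minimal sketch of how a hypothetical rule such as root ::= "a" . "c" could be encoded, assuming the usual two-field llama_grammar_element layout (type, value); the rule and its encoding are illustrative, not taken from this diff:

    // Hypothetical encoding of: root ::= "a" . "c"
    const llama_grammar_element root_rule[] = {
        { LLAMA_GRETYPE_CHAR,     'a' },  // literal "a"
        { LLAMA_GRETYPE_CHAR_ANY, 0   },  // "." matches any character; value is ignored by the matcher
        { LLAMA_GRETYPE_CHAR,     'c' },  // literal "c"
        { LLAMA_GRETYPE_END,      0   },  // end of this alternative
    };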
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.16.0
+  version: 0.16.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-06-
+date: 2024-06-15 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -42,6 +42,8 @@ files:
 - vendor/tmp/llama.cpp/ggml-backend-impl.h
 - vendor/tmp/llama.cpp/ggml-backend.c
 - vendor/tmp/llama.cpp/ggml-backend.h
+- vendor/tmp/llama.cpp/ggml-blas.cpp
+- vendor/tmp/llama.cpp/ggml-blas.h
 - vendor/tmp/llama.cpp/ggml-common.h
 - vendor/tmp/llama.cpp/ggml-cuda.cu
 - vendor/tmp/llama.cpp/ggml-cuda.h
@@ -161,6 +163,16 @@ files:
 - vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
 - vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
 - vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu
 - vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu
 - vendor/tmp/llama.cpp/ggml-cuda/unary.cu
 - vendor/tmp/llama.cpp/ggml-cuda/upscale.cu
@@ -214,7 +226,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.
+rubygems_version: 3.5.9
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.