llama_cpp 0.15.4 → 0.16.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +3 -2
- data/ext/llama_cpp/llama_cpp.cpp +17 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +166 -82
- data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
- data/vendor/tmp/llama.cpp/ggml.c +278 -603
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +345 -473
- data/vendor/tmp/llama.cpp/llama.h +21 -43
- metadata +134 -7
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
@@ -13,8 +13,6 @@
|
|
13
13
|
|
14
14
|
#ifdef GGML_USE_CUDA
|
15
15
|
# include "ggml-cuda.h"
|
16
|
-
#elif defined(GGML_USE_CLBLAST)
|
17
|
-
# include "ggml-opencl.h"
|
18
16
|
#elif defined(GGML_USE_VULKAN)
|
19
17
|
# include "ggml-vulkan.h"
|
20
18
|
#elif defined(GGML_USE_SYCL)
|
@@ -23,6 +21,10 @@
|
|
23
21
|
# include "ggml-kompute.h"
|
24
22
|
#endif
|
25
23
|
|
24
|
+
#ifdef GGML_USE_BLAS
|
25
|
+
# include "ggml-blas.h"
|
26
|
+
#endif
|
27
|
+
|
26
28
|
#ifdef GGML_USE_METAL
|
27
29
|
# include "ggml-metal.h"
|
28
30
|
#endif
|
@@ -110,7 +112,7 @@
|
|
110
112
|
//
|
111
113
|
|
112
114
|
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
113
|
-
static void llama_log_internal (ggml_log_level level, const char* format, ...);
|
115
|
+
static void llama_log_internal (ggml_log_level level, const char * format, ...);
|
114
116
|
static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
115
117
|
|
116
118
|
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
@@ -706,6 +708,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
706
708
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
707
709
|
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
708
710
|
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
711
|
+
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
|
709
712
|
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
710
713
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
711
714
|
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
@@ -1850,7 +1853,7 @@ struct llama_hparams {
|
|
1850
1853
|
float rope_attn_factor = 1.0f;
|
1851
1854
|
float rope_freq_base_train;
|
1852
1855
|
float rope_freq_scale_train;
|
1853
|
-
uint32_t
|
1856
|
+
uint32_t n_ctx_orig_yarn;
|
1854
1857
|
float rope_yarn_log_mul;
|
1855
1858
|
|
1856
1859
|
// for State Space Models
|
@@ -1892,7 +1895,7 @@ struct llama_hparams {
|
|
1892
1895
|
if (this->n_expert_shared != other.n_expert_shared) return true;
|
1893
1896
|
|
1894
1897
|
if (this->rope_finetuned != other.rope_finetuned) return true;
|
1895
|
-
if (this->
|
1898
|
+
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
|
1896
1899
|
|
1897
1900
|
if (this->ssm_d_conv != other.ssm_d_conv) return true;
|
1898
1901
|
if (this->ssm_d_inner != other.ssm_d_inner) return true;
|
@@ -1951,7 +1954,7 @@ struct llama_cparams {
|
|
1951
1954
|
float rope_freq_base;
|
1952
1955
|
float rope_freq_scale;
|
1953
1956
|
|
1954
|
-
uint32_t
|
1957
|
+
uint32_t n_ctx_orig_yarn;
|
1955
1958
|
// These hyperparameters are not exposed in GGUF, because all
|
1956
1959
|
// existing YaRN models use the same values for them.
|
1957
1960
|
float yarn_ext_factor;
|
@@ -2149,12 +2152,12 @@ struct llama_control_vector {
|
|
2149
2152
|
struct llama_vocab {
|
2150
2153
|
using id = int32_t;
|
2151
2154
|
using token = std::string;
|
2152
|
-
using
|
2155
|
+
using tattr = llama_token_attr;
|
2153
2156
|
|
2154
2157
|
struct token_data {
|
2155
2158
|
token text;
|
2156
2159
|
float score;
|
2157
|
-
|
2160
|
+
tattr attr;
|
2158
2161
|
};
|
2159
2162
|
|
2160
2163
|
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
@@ -2164,8 +2167,7 @@ struct llama_vocab {
|
|
2164
2167
|
std::vector<token_data> id_to_token;
|
2165
2168
|
|
2166
2169
|
std::vector<id> cache_special_tokens;
|
2167
|
-
std::vector<token> cache_token_to_piece;
|
2168
|
-
std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
|
2170
|
+
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
|
2169
2171
|
|
2170
2172
|
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
2171
2173
|
|
@@ -2301,9 +2303,13 @@ struct llama_context {
|
|
2301
2303
|
std::vector<ggml_backend_t> backends;
|
2302
2304
|
#ifdef GGML_USE_METAL
|
2303
2305
|
ggml_backend_t backend_metal = nullptr;
|
2306
|
+
#endif
|
2307
|
+
#ifdef GGML_USE_BLAS
|
2308
|
+
ggml_backend_t backend_blas = nullptr;
|
2304
2309
|
#endif
|
2305
2310
|
ggml_backend_t backend_cpu = nullptr;
|
2306
2311
|
|
2312
|
+
|
2307
2313
|
const llama_model & model;
|
2308
2314
|
|
2309
2315
|
// key + value cache for the self attention
|
@@ -2372,13 +2378,34 @@ struct llama_context {
|
|
2372
2378
|
struct llama_control_vector cvec;
|
2373
2379
|
};
|
2374
2380
|
|
2381
|
+
static size_t llama_get_device_count(const llama_model & model) {
|
2382
|
+
size_t count = 1;
|
2383
|
+
#if defined(GGML_USE_CUDA)
|
2384
|
+
count = ggml_backend_cuda_get_device_count();
|
2385
|
+
#elif defined(GGML_USE_SYCL)
|
2386
|
+
count = ggml_backend_sycl_get_device_count();
|
2387
|
+
#elif defined(GGML_USE_VULKAN)
|
2388
|
+
count = ggml_backend_vk_get_device_count();
|
2389
|
+
#endif
|
2390
|
+
#if defined(GGML_USE_RPC)
|
2391
|
+
count += model.rpc_servers.size();
|
2392
|
+
#endif
|
2393
|
+
return count;
|
2394
|
+
GGML_UNUSED(model);
|
2395
|
+
}
|
2396
|
+
|
2375
2397
|
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
2376
2398
|
ggml_backend_buffer_type_t buft = nullptr;
|
2377
2399
|
|
2378
|
-
#
|
2379
|
-
|
2380
|
-
|
2381
|
-
|
2400
|
+
#if defined(GGML_USE_RPC)
|
2401
|
+
int dev_count = (int)llama_get_device_count(model);
|
2402
|
+
int rpc_count = (int)model.rpc_servers.size();
|
2403
|
+
if (gpu >= dev_count - rpc_count) {
|
2404
|
+
const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
|
2405
|
+
return ggml_backend_rpc_buffer_type(endpoint);
|
2406
|
+
}
|
2407
|
+
#endif
|
2408
|
+
#if defined(GGML_USE_METAL)
|
2382
2409
|
buft = ggml_backend_metal_buffer_type();
|
2383
2410
|
#elif defined(GGML_USE_CUDA)
|
2384
2411
|
buft = ggml_backend_cuda_buffer_type(gpu);
|
@@ -2386,8 +2413,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
|
|
2386
2413
|
buft = ggml_backend_vk_buffer_type(gpu);
|
2387
2414
|
#elif defined(GGML_USE_SYCL)
|
2388
2415
|
buft = ggml_backend_sycl_buffer_type(gpu);
|
2389
|
-
#elif defined(GGML_USE_CLBLAST)
|
2390
|
-
buft = ggml_backend_opencl_buffer_type();
|
2391
2416
|
#elif defined(GGML_USE_KOMPUTE)
|
2392
2417
|
buft = ggml_backend_kompute_buffer_type(gpu);
|
2393
2418
|
if (buft == nullptr) {
|
@@ -2426,29 +2451,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
|
|
2426
2451
|
GGML_UNUSED(tensor_split);
|
2427
2452
|
}
|
2428
2453
|
|
2429
|
-
static size_t llama_get_device_count(const llama_model & model) {
|
2430
|
-
#if defined(GGML_USE_RPC)
|
2431
|
-
return model.rpc_servers.size();
|
2432
|
-
#elif defined(GGML_USE_CUDA)
|
2433
|
-
return ggml_backend_cuda_get_device_count();
|
2434
|
-
#elif defined(GGML_USE_SYCL)
|
2435
|
-
return ggml_backend_sycl_get_device_count();
|
2436
|
-
#elif defined(GGML_USE_VULKAN)
|
2437
|
-
return ggml_backend_vk_get_device_count();
|
2438
|
-
#else
|
2439
|
-
return 1;
|
2440
|
-
#endif
|
2441
|
-
GGML_UNUSED(model);
|
2442
|
-
}
|
2443
|
-
|
2444
2454
|
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
2445
2455
|
#if defined(GGML_USE_RPC)
|
2446
|
-
|
2447
|
-
|
2448
|
-
|
2449
|
-
|
2450
|
-
|
2451
|
-
|
2456
|
+
int dev_count = (int)llama_get_device_count(model);
|
2457
|
+
int rpc_count = (int)model.rpc_servers.size();
|
2458
|
+
if (device >= dev_count - rpc_count) {
|
2459
|
+
size_t total;
|
2460
|
+
size_t free;
|
2461
|
+
const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
|
2462
|
+
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
2463
|
+
return free;
|
2464
|
+
}
|
2465
|
+
#endif
|
2466
|
+
#if defined(GGML_USE_CUDA)
|
2452
2467
|
size_t total;
|
2453
2468
|
size_t free;
|
2454
2469
|
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
@@ -2520,10 +2535,6 @@ static bool llama_kv_cache_init(
|
|
2520
2535
|
}
|
2521
2536
|
}
|
2522
2537
|
|
2523
|
-
#ifdef GGML_USE_CLBLAST
|
2524
|
-
offload = false;
|
2525
|
-
#endif
|
2526
|
-
|
2527
2538
|
// count used buffer types
|
2528
2539
|
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
2529
2540
|
if (offload) {
|
@@ -4003,8 +4014,8 @@ static void llm_load_hparams(
|
|
4003
4014
|
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
4004
4015
|
hparams.rope_finetuned = rope_finetuned;
|
4005
4016
|
|
4006
|
-
hparams.
|
4007
|
-
ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.
|
4017
|
+
hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
|
4018
|
+
ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
|
4008
4019
|
|
4009
4020
|
// rope_freq_base (optional)
|
4010
4021
|
hparams.rope_freq_base_train = 10000.0f;
|
@@ -4550,35 +4561,6 @@ static void llm_load_vocab(
|
|
4550
4561
|
vocab.special_cls_id = -1;
|
4551
4562
|
vocab.special_mask_id = -1;
|
4552
4563
|
|
4553
|
-
// For Fill-In-the-Middle (FIM)/infill models which were converted
|
4554
|
-
// prior to support of FIM special tokens in GGUF, the following
|
4555
|
-
// will allow those models to continue to work. The general names
|
4556
|
-
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
|
4557
|
-
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
|
4558
|
-
// new versions of these models have been published.
|
4559
|
-
std::string gen_name;
|
4560
|
-
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
|
4561
|
-
|
4562
|
-
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
|
4563
|
-
[](unsigned char c){ return std::tolower(c); });
|
4564
|
-
|
4565
|
-
if (gen_name.find("code") != std::string::npos) {
|
4566
|
-
if (model.arch == LLM_ARCH_LLAMA) {
|
4567
|
-
vocab.special_prefix_id = 32007;
|
4568
|
-
vocab.special_suffix_id = 32008;
|
4569
|
-
vocab.special_middle_id = 32009;
|
4570
|
-
vocab.special_eot_id = 32010;
|
4571
|
-
} else if (model.arch == LLM_ARCH_GEMMA) {
|
4572
|
-
vocab.special_prefix_id = 67;
|
4573
|
-
vocab.special_suffix_id = 69;
|
4574
|
-
vocab.special_middle_id = 68;
|
4575
|
-
// TODO: this is not EOT, it is "file separator" token, needs fix
|
4576
|
-
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
|
4577
|
-
//vocab.special_eot_id = 70;
|
4578
|
-
vocab.special_eot_id = 107;
|
4579
|
-
}
|
4580
|
-
}
|
4581
|
-
|
4582
4564
|
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
4583
4565
|
if (add_space_prefix_keyidx != -1) {
|
4584
4566
|
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
@@ -4651,8 +4633,7 @@ static void llm_load_vocab(
|
|
4651
4633
|
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
4652
4634
|
LLAMA_LOG_WARN("%s: \n", __func__);
|
4653
4635
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
4654
|
-
} else if (
|
4655
|
-
tokenizer_pre == "default") {
|
4636
|
+
} else if (tokenizer_pre == "default") {
|
4656
4637
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
4657
4638
|
} else if (
|
4658
4639
|
tokenizer_pre == "llama3" ||
|
@@ -4679,7 +4660,8 @@ static void llm_load_vocab(
|
|
4679
4660
|
tokenizer_pre == "jina-es" ||
|
4680
4661
|
tokenizer_pre == "jina-de" ||
|
4681
4662
|
tokenizer_pre == "jina-v2-es" ||
|
4682
|
-
tokenizer_pre == "jina-v2-de"
|
4663
|
+
tokenizer_pre == "jina-v2-de" ||
|
4664
|
+
tokenizer_pre == "jina-v2-code") {
|
4683
4665
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
4684
4666
|
} else if (
|
4685
4667
|
tokenizer_pre == "refact") {
|
@@ -4702,6 +4684,9 @@ static void llm_load_vocab(
|
|
4702
4684
|
} else if (
|
4703
4685
|
tokenizer_pre == "smaug-bpe") {
|
4704
4686
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
|
4687
|
+
} else if (
|
4688
|
+
tokenizer_pre == "poro-chat") {
|
4689
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
|
4705
4690
|
} else {
|
4706
4691
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
4707
4692
|
}
|
@@ -4740,12 +4725,64 @@ static void llm_load_vocab(
|
|
4740
4725
|
auto & token_data = vocab.id_to_token[i];
|
4741
4726
|
token_data.text = std::move(word);
|
4742
4727
|
token_data.score = scores ? scores[i] : 0.0f;
|
4743
|
-
token_data.
|
4728
|
+
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
|
4729
|
+
|
4730
|
+
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
|
4731
|
+
switch(toktypes[i]) {
|
4732
|
+
case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
|
4733
|
+
case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
|
4734
|
+
case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
|
4735
|
+
case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
|
4736
|
+
case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
|
4737
|
+
case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
|
4738
|
+
case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
4739
|
+
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
4740
|
+
}
|
4741
|
+
}
|
4744
4742
|
}
|
4745
4743
|
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
|
4746
4744
|
|
4747
4745
|
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
4748
4746
|
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
4747
|
+
// For Fill-In-the-Middle (FIM)/infill models which were converted
|
4748
|
+
// prior to support of FIM special tokens in GGUF, the following
|
4749
|
+
// will allow those models to continue to work. The general names
|
4750
|
+
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
|
4751
|
+
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
|
4752
|
+
// new versions of these models have been published.
|
4753
|
+
std::string gen_name;
|
4754
|
+
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
|
4755
|
+
|
4756
|
+
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
|
4757
|
+
[](unsigned char c){ return std::tolower(c); });
|
4758
|
+
|
4759
|
+
if (gen_name.find("code") != std::string::npos) {
|
4760
|
+
if (model.arch == LLM_ARCH_LLAMA
|
4761
|
+
&& 32010 < vocab.id_to_token.size()
|
4762
|
+
&& vocab.id_to_token[32007].text == "<PRE>"
|
4763
|
+
&& vocab.id_to_token[32008].text == "<SUF>"
|
4764
|
+
&& vocab.id_to_token[32009].text == "<MID>"
|
4765
|
+
&& vocab.id_to_token[32010].text == "<EOT>") {
|
4766
|
+
vocab.special_prefix_id = 32007;
|
4767
|
+
vocab.special_suffix_id = 32008;
|
4768
|
+
vocab.special_middle_id = 32009;
|
4769
|
+
vocab.special_eot_id = 32010;
|
4770
|
+
} else if (model.arch == LLM_ARCH_GEMMA
|
4771
|
+
&& 107 < vocab.id_to_token.size()
|
4772
|
+
&& vocab.id_to_token[67].text == "<|fim_prefix|>"
|
4773
|
+
&& vocab.id_to_token[69].text == "<|fim_suffix|>"
|
4774
|
+
&& vocab.id_to_token[68].text == "<|fim_middle|>"
|
4775
|
+
&& vocab.id_to_token[107].text == "<end_of_turn>") {
|
4776
|
+
vocab.special_prefix_id = 67;
|
4777
|
+
vocab.special_suffix_id = 69;
|
4778
|
+
vocab.special_middle_id = 68;
|
4779
|
+
// TODO: this is not EOT, it is "file separator" token, needs fix
|
4780
|
+
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
|
4781
|
+
//vocab.special_eot_id = 70;
|
4782
|
+
vocab.special_eot_id = 107;
|
4783
|
+
}
|
4784
|
+
}
|
4785
|
+
|
4749
4786
|
try {
|
4750
4787
|
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
4751
4788
|
} catch (const std::exception & e) {
|
@@ -4831,7 +4868,7 @@ static void llm_load_vocab(
|
|
4831
4868
|
// build special tokens cache
|
4832
4869
|
{
|
4833
4870
|
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
|
4834
|
-
if (vocab.id_to_token[id].
|
4871
|
+
if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
|
4835
4872
|
vocab.cache_special_tokens.push_back(id);
|
4836
4873
|
}
|
4837
4874
|
}
|
@@ -4845,26 +4882,75 @@ static void llm_load_vocab(
|
|
4845
4882
|
LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
|
4846
4883
|
}
|
4847
4884
|
|
4848
|
-
// build token to piece
|
4885
|
+
// build token to piece cache
|
4849
4886
|
{
|
4850
4887
|
size_t size_cache = 0;
|
4851
4888
|
|
4852
|
-
std::vector<llama_vocab::token> cache_token_to_piece
|
4853
|
-
std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
|
4889
|
+
std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
|
4854
4890
|
|
4855
4891
|
for (uint32_t id = 0; id < n_vocab; ++id) {
|
4856
|
-
cache_token_to_piece[id]
|
4857
|
-
cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
|
4892
|
+
cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
|
4858
4893
|
|
4859
4894
|
size_cache += cache_token_to_piece[id].size();
|
4860
|
-
size_cache += cache_token_to_piece_special[id].size();
|
4861
4895
|
}
|
4862
4896
|
|
4863
|
-
std::swap(vocab.cache_token_to_piece,
|
4864
|
-
std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
|
4897
|
+
std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
|
4865
4898
|
|
4866
4899
|
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
|
4867
4900
|
}
|
4901
|
+
|
4902
|
+
// Handle per token attributes
|
4903
|
+
//NOTE: Each model customizes per token attributes.
|
4904
|
+
//NOTE: Per token attributes are missing from the GGUF file.
|
4905
|
+
//TODO: Extract attributes from GGUF file.
|
4906
|
+
{
|
4907
|
+
auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
|
4908
|
+
for (auto substr : substrs) {
|
4909
|
+
if (str.find(substr) < std::string::npos) {
|
4910
|
+
return true;
|
4911
|
+
}
|
4912
|
+
}
|
4913
|
+
return false;
|
4914
|
+
};
|
4915
|
+
|
4916
|
+
auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
|
4917
|
+
uint32_t current = vocab.id_to_token.at(id).attr;
|
4918
|
+
current = value ? (current | attr) : (current & ~attr);
|
4919
|
+
vocab.id_to_token[id].attr = (llama_token_attr) current;
|
4920
|
+
};
|
4921
|
+
|
4922
|
+
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
|
4923
|
+
_set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
|
4924
|
+
};
|
4925
|
+
|
4926
|
+
std::string model_name;
|
4927
|
+
std::string tokenizer_pre;
|
4928
|
+
|
4929
|
+
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
|
4930
|
+
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
4931
|
+
|
4932
|
+
// model name to lowercase
|
4933
|
+
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
|
4934
|
+
[] (const std::string::value_type x) {
|
4935
|
+
return std::tolower(x);
|
4936
|
+
}
|
4937
|
+
);
|
4938
|
+
|
4939
|
+
// set attributes by model/tokenizer name
|
4940
|
+
if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
|
4941
|
+
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
|
4942
|
+
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
|
4943
|
+
for (auto id : vocab.cache_special_tokens) {
|
4944
|
+
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
4945
|
+
}
|
4946
|
+
for (auto token : {"</s>"}) {
|
4947
|
+
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
4948
|
+
}
|
4949
|
+
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
|
4950
|
+
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
|
4951
|
+
}
|
4952
|
+
}
|
4953
|
+
}
|
4868
4954
|
}
|
4869
4955
|
|
4870
4956
|
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
@@ -4904,7 +4990,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
4904
4990
|
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
4905
4991
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
4906
4992
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
4907
|
-
LLAMA_LOG_INFO("%s:
|
4993
|
+
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
4908
4994
|
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
|
4909
4995
|
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
4910
4996
|
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
@@ -5129,12 +5215,10 @@ static bool llm_load_tensors(
|
|
5129
5215
|
// output
|
5130
5216
|
{
|
5131
5217
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5132
|
-
|
5133
|
-
|
5134
|
-
|
5135
|
-
|
5136
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5137
|
-
}
|
5218
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5219
|
+
// if output is NULL, init from the input tok embed
|
5220
|
+
if (model.output == NULL) {
|
5221
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
5138
5222
|
}
|
5139
5223
|
}
|
5140
5224
|
|
@@ -5453,7 +5537,7 @@ static bool llm_load_tensors(
|
|
5453
5537
|
|
5454
5538
|
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
5455
5539
|
} else {
|
5456
|
-
layer.ffn_gate
|
5540
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5457
5541
|
}
|
5458
5542
|
|
5459
5543
|
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
@@ -5494,6 +5578,9 @@ static bool llm_load_tensors(
|
|
5494
5578
|
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
|
5495
5579
|
layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
|
5496
5580
|
|
5581
|
+
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5582
|
+
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5583
|
+
|
5497
5584
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5498
5585
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5499
5586
|
|
@@ -7072,7 +7159,7 @@ struct llm_build_context {
|
|
7072
7159
|
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
|
7073
7160
|
const int32_t n_outputs;
|
7074
7161
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
7075
|
-
const int32_t
|
7162
|
+
const int32_t n_ctx_orig;
|
7076
7163
|
|
7077
7164
|
const bool flash_attn;
|
7078
7165
|
|
@@ -7121,7 +7208,7 @@ struct llm_build_context {
|
|
7121
7208
|
n_kv (worst_case ? kv_self.size : kv_self.n),
|
7122
7209
|
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
|
7123
7210
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
7124
|
-
|
7211
|
+
n_ctx_orig (cparams.n_ctx_orig_yarn),
|
7125
7212
|
flash_attn (cparams.flash_attn),
|
7126
7213
|
pooling_type (cparams.pooling_type),
|
7127
7214
|
rope_type (hparams.rope_type),
|
@@ -7179,7 +7266,7 @@ struct llm_build_context {
|
|
7179
7266
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
|
7180
7267
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
7181
7268
|
0),
|
7182
|
-
lctx.inp_K_shift, rope_factors, n_rot, rope_type,
|
7269
|
+
lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
7183
7270
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
7184
7271
|
|
7185
7272
|
cb(tmp, "K_shifted", il);
|
@@ -7288,7 +7375,7 @@ struct llm_build_context {
|
|
7288
7375
|
// choose long/short freq factors based on the context size
|
7289
7376
|
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
|
7290
7377
|
|
7291
|
-
if (n_ctx_pre_seq > hparams.
|
7378
|
+
if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
|
7292
7379
|
return model.layers[il].rope_long;
|
7293
7380
|
}
|
7294
7381
|
|
@@ -7404,14 +7491,14 @@ struct llm_build_context {
|
|
7404
7491
|
|
7405
7492
|
Qcur = ggml_rope_ext(
|
7406
7493
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7407
|
-
n_rot, rope_type,
|
7494
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
7408
7495
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7409
7496
|
);
|
7410
7497
|
cb(Qcur, "Qcur", il);
|
7411
7498
|
|
7412
7499
|
Kcur = ggml_rope_ext(
|
7413
7500
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7414
|
-
n_rot, rope_type,
|
7501
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
7415
7502
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7416
7503
|
);
|
7417
7504
|
cb(Kcur, "Kcur", il);
|
@@ -7535,12 +7622,12 @@ struct llm_build_context {
|
|
7535
7622
|
case MODEL_7B:
|
7536
7623
|
Qcur = ggml_rope_ext(
|
7537
7624
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7538
|
-
n_rot, rope_type,
|
7625
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
7539
7626
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7540
7627
|
);
|
7541
7628
|
Kcur = ggml_rope_ext(
|
7542
7629
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7543
|
-
n_rot, rope_type,
|
7630
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
7544
7631
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7545
7632
|
);
|
7546
7633
|
break;
|
@@ -7647,14 +7734,14 @@ struct llm_build_context {
|
|
7647
7734
|
|
7648
7735
|
Qcur = ggml_rope_ext(
|
7649
7736
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7650
|
-
n_rot, rope_type,
|
7737
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
7651
7738
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7652
7739
|
);
|
7653
7740
|
cb(Qcur, "Qcur", il);
|
7654
7741
|
|
7655
7742
|
Kcur = ggml_rope_ext(
|
7656
7743
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7657
|
-
n_rot, rope_type,
|
7744
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
7658
7745
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7659
7746
|
);
|
7660
7747
|
cb(Kcur, "Kcur", il);
|
@@ -7767,13 +7854,13 @@ struct llm_build_context {
|
|
7767
7854
|
|
7768
7855
|
// using mode = 2 for neox mode
|
7769
7856
|
Qcur = ggml_rope_ext(
|
7770
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
7857
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
7771
7858
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
7772
7859
|
);
|
7773
7860
|
cb(Qcur, "Qcur", il);
|
7774
7861
|
|
7775
7862
|
Kcur = ggml_rope_ext(
|
7776
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
7863
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
7777
7864
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
7778
7865
|
);
|
7779
7866
|
cb(Kcur, "Kcur", il);
|
@@ -7891,14 +7978,14 @@ struct llm_build_context {
|
|
7891
7978
|
|
7892
7979
|
Qcur = ggml_rope_ext(
|
7893
7980
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
7894
|
-
n_rot, rope_type,
|
7981
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
7895
7982
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7896
7983
|
);
|
7897
7984
|
cb(Qcur, "Qcur", il);
|
7898
7985
|
|
7899
7986
|
Kcur = ggml_rope_ext(
|
7900
7987
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
7901
|
-
n_rot, rope_type,
|
7988
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
7902
7989
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7903
7990
|
);
|
7904
7991
|
cb(Kcur, "Kcur", il);
|
@@ -8044,14 +8131,14 @@ struct llm_build_context {
|
|
8044
8131
|
|
8045
8132
|
Qcur = ggml_rope_ext(
|
8046
8133
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
8047
|
-
n_rot, rope_type,
|
8134
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
8048
8135
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8049
8136
|
);
|
8050
8137
|
cb(Qcur, "Qcur", il);
|
8051
8138
|
|
8052
8139
|
Kcur = ggml_rope_ext(
|
8053
8140
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
8054
|
-
n_rot, rope_type,
|
8141
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
8055
8142
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8056
8143
|
);
|
8057
8144
|
cb(Kcur, "Kcur", il);
|
@@ -8398,14 +8485,14 @@ struct llm_build_context {
|
|
8398
8485
|
|
8399
8486
|
Qcur = ggml_rope_ext(
|
8400
8487
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
8401
|
-
n_rot, rope_type,
|
8488
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
8402
8489
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8403
8490
|
);
|
8404
8491
|
cb(Qcur, "Qcur", il);
|
8405
8492
|
|
8406
8493
|
Kcur = ggml_rope_ext(
|
8407
8494
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
8408
|
-
n_rot, rope_type,
|
8495
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
8409
8496
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8410
8497
|
);
|
8411
8498
|
cb(Kcur, "Kcur", il);
|
@@ -8457,6 +8544,11 @@ struct llm_build_context {
|
|
8457
8544
|
// attention layer norm
|
8458
8545
|
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
|
8459
8546
|
|
8547
|
+
if (model.layers[il].attn_norm_2 != nullptr) {
|
8548
|
+
cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
|
8549
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
|
8550
|
+
}
|
8551
|
+
|
8460
8552
|
struct ggml_tensor * ffn_inp = cur;
|
8461
8553
|
cb(ffn_inp, "ffn_inp", il);
|
8462
8554
|
|
@@ -8838,14 +8930,14 @@ struct llm_build_context {
|
|
8838
8930
|
|
8839
8931
|
Qcur = ggml_rope_ext(
|
8840
8932
|
ctx0, Qcur, inp_pos, nullptr,
|
8841
|
-
n_rot, rope_type,
|
8933
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
8842
8934
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8843
8935
|
);
|
8844
8936
|
cb(Qcur, "Qcur", il);
|
8845
8937
|
|
8846
8938
|
Kcur = ggml_rope_ext(
|
8847
8939
|
ctx0, Kcur, inp_pos, nullptr,
|
8848
|
-
n_rot, rope_type,
|
8940
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
8849
8941
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8850
8942
|
);
|
8851
8943
|
cb(Kcur, "Kcur", il);
|
@@ -8957,13 +9049,13 @@ struct llm_build_context {
|
|
8957
9049
|
|
8958
9050
|
// using mode = 2 for neox mode
|
8959
9051
|
Qcur = ggml_rope_ext(
|
8960
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
9052
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
8961
9053
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8962
9054
|
);
|
8963
9055
|
cb(Qcur, "Qcur", il);
|
8964
9056
|
|
8965
9057
|
Kcur = ggml_rope_ext(
|
8966
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
9058
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
8967
9059
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8968
9060
|
);
|
8969
9061
|
cb(Kcur, "Kcur", il);
|
@@ -9069,14 +9161,14 @@ struct llm_build_context {
|
|
9069
9161
|
|
9070
9162
|
Qcur = ggml_rope_ext(
|
9071
9163
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9072
|
-
n_rot, rope_type,
|
9164
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
9073
9165
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9074
9166
|
);
|
9075
9167
|
cb(Qcur, "Qcur", il);
|
9076
9168
|
|
9077
9169
|
Kcur = ggml_rope_ext(
|
9078
9170
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9079
|
-
n_rot, rope_type,
|
9171
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
9080
9172
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9081
9173
|
);
|
9082
9174
|
cb(Kcur, "Kcur", il);
|
@@ -9183,14 +9275,14 @@ struct llm_build_context {
|
|
9183
9275
|
|
9184
9276
|
Qcur = ggml_rope_ext(
|
9185
9277
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9186
|
-
n_rot, rope_type,
|
9278
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
9187
9279
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9188
9280
|
);
|
9189
9281
|
cb(Qcur, "Qcur", il);
|
9190
9282
|
|
9191
9283
|
Kcur = ggml_rope_ext(
|
9192
9284
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9193
|
-
n_rot, rope_type,
|
9285
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
9194
9286
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9195
9287
|
);
|
9196
9288
|
cb(Kcur, "Kcur", il);
|
@@ -9335,7 +9427,7 @@ struct llm_build_context {
|
|
9335
9427
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
9336
9428
|
|
9337
9429
|
Qcur = ggml_rope_ext(
|
9338
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
9430
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
9339
9431
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9340
9432
|
);
|
9341
9433
|
cb(Qcur, "Qcur", il);
|
@@ -9346,7 +9438,7 @@ struct llm_build_context {
|
|
9346
9438
|
cb(Qcur, "Qcur", il);
|
9347
9439
|
|
9348
9440
|
Kcur = ggml_rope_ext(
|
9349
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
9441
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
9350
9442
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9351
9443
|
);
|
9352
9444
|
cb(Kcur, "Kcur", il);
|
@@ -9457,7 +9549,7 @@ struct llm_build_context {
|
|
9457
9549
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
9458
9550
|
|
9459
9551
|
Qcur = ggml_rope_ext(
|
9460
|
-
ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type,
|
9552
|
+
ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
|
9461
9553
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9462
9554
|
);
|
9463
9555
|
cb(Qcur, "Qcur", il);
|
@@ -9466,7 +9558,7 @@ struct llm_build_context {
|
|
9466
9558
|
cb(Qcur, "Qcur", il);
|
9467
9559
|
|
9468
9560
|
Kcur = ggml_rope_ext(
|
9469
|
-
ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type,
|
9561
|
+
ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
|
9470
9562
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9471
9563
|
);
|
9472
9564
|
cb(Kcur, "Kcur", il);
|
@@ -9574,13 +9666,13 @@ struct llm_build_context {
|
|
9574
9666
|
|
9575
9667
|
Qcur = ggml_rope_ext(
|
9576
9668
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
|
9577
|
-
n_embd_head, rope_type,
|
9669
|
+
n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
|
9578
9670
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9579
9671
|
cb(Qcur, "Qcur", il);
|
9580
9672
|
|
9581
9673
|
Kcur = ggml_rope_ext(
|
9582
9674
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
|
9583
|
-
n_embd_head, rope_type,
|
9675
|
+
n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
|
9584
9676
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9585
9677
|
cb(Kcur, "Kcur", il);
|
9586
9678
|
|
@@ -9782,14 +9874,14 @@ struct llm_build_context {
|
|
9782
9874
|
|
9783
9875
|
struct ggml_tensor * Qcur = ggml_rope_ext(
|
9784
9876
|
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9785
|
-
n_rot, rope_type,
|
9877
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
9786
9878
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9787
9879
|
);
|
9788
9880
|
cb(Qcur, "Qcur", il);
|
9789
9881
|
|
9790
9882
|
struct ggml_tensor * Kcur = ggml_rope_ext(
|
9791
9883
|
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9792
|
-
n_rot, rope_type,
|
9884
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
9793
9885
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9794
9886
|
);
|
9795
9887
|
cb(Kcur, "Kcur", il);
|
@@ -9898,14 +9990,14 @@ struct llm_build_context {
|
|
9898
9990
|
|
9899
9991
|
Qcur = ggml_rope_ext(
|
9900
9992
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9901
|
-
n_rot, rope_type,
|
9993
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
9902
9994
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9903
9995
|
);
|
9904
9996
|
cb(Qcur, "Qcur", il);
|
9905
9997
|
|
9906
9998
|
Kcur = ggml_rope_ext(
|
9907
9999
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9908
|
-
n_rot, rope_type,
|
10000
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
9909
10001
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9910
10002
|
);
|
9911
10003
|
cb(Kcur, "Kcur", il);
|
@@ -10015,14 +10107,14 @@ struct llm_build_context {
|
|
10015
10107
|
|
10016
10108
|
Qcur = ggml_rope_ext(
|
10017
10109
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10018
|
-
n_rot, rope_type,
|
10110
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10019
10111
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10020
10112
|
);
|
10021
10113
|
cb(Qcur, "Qcur", il);
|
10022
10114
|
|
10023
10115
|
Kcur = ggml_rope_ext(
|
10024
10116
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10025
|
-
n_rot, rope_type,
|
10117
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10026
10118
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10027
10119
|
);
|
10028
10120
|
cb(Kcur, "Kcur", il);
|
@@ -10145,14 +10237,14 @@ struct llm_build_context {
|
|
10145
10237
|
|
10146
10238
|
Qcur = ggml_rope_ext(
|
10147
10239
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10148
|
-
n_rot, rope_type,
|
10240
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10149
10241
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10150
10242
|
);
|
10151
10243
|
cb(Qcur, "Qcur", il);
|
10152
10244
|
|
10153
10245
|
Kcur = ggml_rope_ext(
|
10154
10246
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10155
|
-
n_rot, rope_type,
|
10247
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10156
10248
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10157
10249
|
);
|
10158
10250
|
cb(Kcur, "Kcur", il);
|
@@ -10217,7 +10309,7 @@ struct llm_build_context {
|
|
10217
10309
|
cb(cur, "lmhead_scaling", -1);
|
10218
10310
|
|
10219
10311
|
// lm_head
|
10220
|
-
cur = ggml_mul_mat(ctx0, model.
|
10312
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
10221
10313
|
cb(cur, "result_output", -1);
|
10222
10314
|
|
10223
10315
|
ggml_build_forward_expand(gf, cur);
|
@@ -10265,7 +10357,7 @@ struct llm_build_context {
|
|
10265
10357
|
|
10266
10358
|
Qcur = ggml_rope_ext(
|
10267
10359
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
|
10268
|
-
n_embd_head_k, rope_type,
|
10360
|
+
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10269
10361
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
10270
10362
|
cb(Qcur, "Qcur", il);
|
10271
10363
|
|
@@ -10274,7 +10366,7 @@ struct llm_build_context {
|
|
10274
10366
|
|
10275
10367
|
Kcur = ggml_rope_ext(
|
10276
10368
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
|
10277
|
-
n_embd_head_k, rope_type,
|
10369
|
+
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10278
10370
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
10279
10371
|
cb(Kcur, "Kcur", il);
|
10280
10372
|
|
@@ -10385,14 +10477,14 @@ struct llm_build_context {
|
|
10385
10477
|
|
10386
10478
|
Qcur = ggml_rope_ext(
|
10387
10479
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10388
|
-
n_rot, rope_type,
|
10480
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10389
10481
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10390
10482
|
);
|
10391
10483
|
cb(Qcur, "Qcur", il);
|
10392
10484
|
|
10393
10485
|
Kcur = ggml_rope_ext(
|
10394
10486
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10395
|
-
n_rot, rope_type,
|
10487
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10396
10488
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10397
10489
|
);
|
10398
10490
|
cb(Kcur, "Kcur", il);
|
@@ -10675,14 +10767,14 @@ struct llm_build_context {
|
|
10675
10767
|
|
10676
10768
|
Qcur = ggml_rope_ext(
|
10677
10769
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10678
|
-
n_rot, rope_type,
|
10770
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10679
10771
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10680
10772
|
);
|
10681
10773
|
cb(Qcur, "Qcur", il);
|
10682
10774
|
|
10683
10775
|
Kcur = ggml_rope_ext(
|
10684
10776
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10685
|
-
n_rot, rope_type,
|
10777
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10686
10778
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10687
10779
|
);
|
10688
10780
|
cb(Kcur, "Kcur", il);
|
@@ -10806,14 +10898,14 @@ struct llm_build_context {
|
|
10806
10898
|
|
10807
10899
|
Qcur = ggml_rope_ext(
|
10808
10900
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10809
|
-
n_rot, rope_type,
|
10901
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10810
10902
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10811
10903
|
);
|
10812
10904
|
cb(Qcur, "Qcur", il);
|
10813
10905
|
|
10814
10906
|
Kcur = ggml_rope_ext(
|
10815
10907
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10816
|
-
n_rot, rope_type,
|
10908
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10817
10909
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10818
10910
|
);
|
10819
10911
|
cb(Kcur, "Kcur", il);
|
@@ -10920,14 +11012,14 @@ struct llm_build_context {
|
|
10920
11012
|
|
10921
11013
|
Qcur = ggml_rope_ext(
|
10922
11014
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
10923
|
-
n_rot, rope_type,
|
11015
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10924
11016
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10925
11017
|
);
|
10926
11018
|
cb(Qcur, "Qcur", il);
|
10927
11019
|
|
10928
11020
|
Kcur = ggml_rope_ext(
|
10929
11021
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
10930
|
-
n_rot, rope_type,
|
11022
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
10931
11023
|
ext_factor, attn_factor, beta_fast, beta_slow
|
10932
11024
|
);
|
10933
11025
|
cb(Kcur, "Kcur", il);
|
@@ -11055,14 +11147,14 @@ struct llm_build_context {
|
|
11055
11147
|
|
11056
11148
|
Qcur = ggml_rope_ext(
|
11057
11149
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
11058
|
-
n_rot, rope_type,
|
11150
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
11059
11151
|
ext_factor, attn_factor, beta_fast, beta_slow
|
11060
11152
|
);
|
11061
11153
|
cb(Qcur, "Qcur", il);
|
11062
11154
|
|
11063
11155
|
Kcur = ggml_rope_ext(
|
11064
11156
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
11065
|
-
n_rot, rope_type,
|
11157
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
11066
11158
|
ext_factor, attn_factor, beta_fast, beta_slow
|
11067
11159
|
);
|
11068
11160
|
cb(Kcur, "Kcur", il);
|
@@ -11272,7 +11364,7 @@ struct llm_build_context {
|
|
11272
11364
|
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
11273
11365
|
q_pe = ggml_rope_ext(
|
11274
11366
|
ctx0, q_pe, inp_pos, nullptr,
|
11275
|
-
n_rot, rope_type,
|
11367
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
11276
11368
|
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
11277
11369
|
);
|
11278
11370
|
cb(q_pe, "q_pe", il);
|
@@ -11281,7 +11373,7 @@ struct llm_build_context {
|
|
11281
11373
|
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
11282
11374
|
k_pe = ggml_rope_ext(
|
11283
11375
|
ctx0, k_pe, inp_pos, nullptr,
|
11284
|
-
n_rot, rope_type,
|
11376
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
11285
11377
|
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
11286
11378
|
);
|
11287
11379
|
cb(k_pe, "k_pe", il);
|
@@ -11458,7 +11550,8 @@ static struct ggml_cgraph * llama_build_graph(
|
|
11458
11550
|
if (batch.n_tokens < 32 || full_offload) {
|
11459
11551
|
if (il != -1 && strcmp(name, "norm") == 0) {
|
11460
11552
|
for (auto * backend : lctx.backends) {
|
11461
|
-
if (
|
11553
|
+
if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
|
11554
|
+
(ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
|
11462
11555
|
ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
|
11463
11556
|
break;
|
11464
11557
|
}
|
@@ -11955,6 +12048,11 @@ static void llama_graph_compute(
|
|
11955
12048
|
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
11956
12049
|
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
11957
12050
|
}
|
12051
|
+
#ifdef GGML_USE_BLAS
|
12052
|
+
if (lctx.backend_blas != nullptr) {
|
12053
|
+
ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
|
12054
|
+
}
|
12055
|
+
#endif
|
11958
12056
|
|
11959
12057
|
ggml_backend_sched_graph_compute_async(lctx.sched, gf);
|
11960
12058
|
|
@@ -12177,17 +12275,6 @@ static int llama_decode_internal(
|
|
12177
12275
|
}
|
12178
12276
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
12179
12277
|
|
12180
|
-
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
12181
|
-
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
12182
|
-
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
|
12183
|
-
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
|
12184
|
-
// with the BLAS calls. need a better solution
|
12185
|
-
// MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
|
12186
|
-
// being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
|
12187
|
-
if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
|
12188
|
-
n_threads = std::min(4, n_threads);
|
12189
|
-
}
|
12190
|
-
|
12191
12278
|
ggml_backend_sched_alloc_graph(lctx.sched, gf);
|
12192
12279
|
|
12193
12280
|
llama_set_inputs(lctx, u_batch);
|
@@ -12616,27 +12703,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
|
|
12616
12703
|
|
12617
12704
|
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
|
12618
12705
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
12619
|
-
return vocab.id_to_token[id].
|
12706
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
|
12620
12707
|
}
|
12621
12708
|
|
12622
12709
|
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
|
12623
12710
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
12624
|
-
return vocab.id_to_token[id].
|
12711
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
|
12625
12712
|
}
|
12626
12713
|
|
12627
12714
|
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
|
12628
12715
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
12629
|
-
return vocab.id_to_token[id].
|
12716
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
|
12630
12717
|
}
|
12631
12718
|
|
12632
12719
|
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
|
12633
12720
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
12634
|
-
return vocab.id_to_token[id].
|
12721
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
|
12635
12722
|
}
|
12636
12723
|
|
12637
12724
|
static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
|
12638
12725
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
12639
|
-
return vocab.id_to_token[id].
|
12726
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
|
12640
12727
|
}
|
12641
12728
|
|
12642
12729
|
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
@@ -12954,6 +13041,11 @@ struct llm_tokenizer_bpe {
|
|
12954
13041
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12955
13042
|
});
|
12956
13043
|
break;
|
13044
|
+
case LLAMA_VOCAB_PRE_TYPE_PORO:
|
13045
|
+
word_collection = unicode_regex_split(text, {
|
13046
|
+
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
13047
|
+
});
|
13048
|
+
break;
|
12957
13049
|
default:
|
12958
13050
|
// default regex for BPE tokenization pre-processing
|
12959
13051
|
word_collection = unicode_regex_split(text, {
|
@@ -13254,7 +13346,8 @@ struct fragment_buffer_variant {
|
|
13254
13346
|
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
13255
13347
|
// for each special token
|
13256
13348
|
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
|
13257
|
-
const auto &
|
13349
|
+
const auto & data = vocab.id_to_token[special_id];
|
13350
|
+
const auto & special_token = data.text;
|
13258
13351
|
|
13259
13352
|
// for each text fragment
|
13260
13353
|
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
|
@@ -13291,13 +13384,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
13291
13384
|
if (match > raw_text_base_offset) {
|
13292
13385
|
// left
|
13293
13386
|
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
13294
|
-
|
13295
|
-
|
13387
|
+
int64_t left_reminder_length = match - raw_text_base_offset;
|
13388
|
+
|
13389
|
+
if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
|
13390
|
+
while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
|
13391
|
+
left_reminder_length--;
|
13392
|
+
}
|
13393
|
+
}
|
13394
|
+
|
13395
|
+
if (left_reminder_length > 0) {
|
13396
|
+
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
13397
|
+
it++;
|
13398
|
+
}
|
13296
13399
|
|
13297
13400
|
#ifdef PRETOKENIZERDEBUG
|
13298
13401
|
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
13299
13402
|
#endif
|
13300
|
-
it++;
|
13301
13403
|
}
|
13302
13404
|
|
13303
13405
|
// special token
|
@@ -13306,16 +13408,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
13306
13408
|
|
13307
13409
|
// right
|
13308
13410
|
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
13309
|
-
|
13310
|
-
|
13311
|
-
|
13411
|
+
int64_t right_reminder_offset = match + special_token.length();
|
13412
|
+
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
13413
|
+
|
13414
|
+
if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
|
13415
|
+
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
|
13416
|
+
right_reminder_offset++;
|
13417
|
+
right_reminder_length--;
|
13418
|
+
}
|
13419
|
+
}
|
13420
|
+
|
13421
|
+
if (right_reminder_length > 0) {
|
13422
|
+
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
13423
|
+
it++;
|
13424
|
+
}
|
13312
13425
|
|
13313
13426
|
#ifdef PRETOKENIZERDEBUG
|
13314
13427
|
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
13315
13428
|
#endif
|
13316
13429
|
|
13317
|
-
it++;
|
13318
|
-
|
13319
13430
|
if (source == 0) {
|
13320
13431
|
buffer.erase_after(buffer.before_begin());
|
13321
13432
|
} else {
|
@@ -13361,9 +13472,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
13361
13472
|
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
13362
13473
|
// tokenizer.encode('', add_special_tokens=False) returns []
|
13363
13474
|
|
13364
|
-
static const bool rtrim = true; //TODO: as param
|
13365
13475
|
bool is_prev_special = false;
|
13366
|
-
bool special_token_rtrim = false;
|
13367
13476
|
|
13368
13477
|
if (add_special && vocab.special_add_bos != 0) {
|
13369
13478
|
GGML_ASSERT(vocab.special_bos_id != -1);
|
@@ -13373,25 +13482,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
13373
13482
|
|
13374
13483
|
for (const auto & fragment : fragment_buffer) {
|
13375
13484
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
13376
|
-
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
13377
|
-
|
13378
|
-
// TODO: It's likely possible to get rid of this string copy entirely
|
13379
|
-
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
|
13380
|
-
// and passing 'add space prefix' as bool argument
|
13381
|
-
//
|
13382
13485
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
13383
13486
|
|
13384
|
-
if (special_token_rtrim) {
|
13385
|
-
size_t num_whitespaces = 0;
|
13386
|
-
while (isspace(raw_text[num_whitespaces])) {
|
13387
|
-
num_whitespaces++;
|
13388
|
-
}
|
13389
|
-
if (num_whitespaces == raw_text.size()) {
|
13390
|
-
continue; // skip if all whitespaces
|
13391
|
-
}
|
13392
|
-
raw_text = raw_text.substr(num_whitespaces);
|
13393
|
-
}
|
13394
|
-
|
13395
13487
|
if (vocab.add_space_prefix) {
|
13396
13488
|
if (!output.size() || is_prev_special) { // prefix with space if first token
|
13397
13489
|
raw_text = " " + raw_text;
|
@@ -13407,11 +13499,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
13407
13499
|
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
13408
13500
|
output.push_back(fragment.token);
|
13409
13501
|
is_prev_special = true;
|
13410
|
-
// phi-3 special tokens without rtrim, works fine for llama-spm too
|
13411
|
-
special_token_rtrim = rtrim
|
13412
|
-
&& fragment.token != vocab.special_bos_id
|
13413
|
-
&& fragment.token != vocab.special_unk_id
|
13414
|
-
&& fragment.token != vocab.special_eos_id;
|
13415
13502
|
}
|
13416
13503
|
}
|
13417
13504
|
|
@@ -13574,7 +13661,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
|
|
13574
13661
|
const uint32_t chr) {
|
13575
13662
|
|
13576
13663
|
bool found = false;
|
13577
|
-
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
|
13664
|
+
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
|
13578
13665
|
|
13579
13666
|
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
|
13580
13667
|
|
@@ -13583,6 +13670,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
|
|
13583
13670
|
// inclusive range, e.g. [a-z]
|
13584
13671
|
found = found || (pos->value <= chr && chr <= pos[1].value);
|
13585
13672
|
pos += 2;
|
13673
|
+
} else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
|
13674
|
+
// Any character matches "."
|
13675
|
+
found = true;
|
13676
|
+
pos += 1;
|
13586
13677
|
} else {
|
13587
13678
|
// exact char match, e.g. [a] or "a"
|
13588
13679
|
found = found || pos->value == chr;
|
@@ -13600,7 +13691,7 @@ static bool llama_grammar_match_partial_char(
|
|
13600
13691
|
const llama_grammar_element * pos,
|
13601
13692
|
const llama_partial_utf8 partial_utf8) {
|
13602
13693
|
|
13603
|
-
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
|
13694
|
+
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
|
13604
13695
|
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
|
13605
13696
|
|
13606
13697
|
uint32_t partial_value = partial_utf8.value;
|
@@ -13630,6 +13721,9 @@ static bool llama_grammar_match_partial_char(
|
|
13630
13721
|
return is_positive_char;
|
13631
13722
|
}
|
13632
13723
|
pos += 2;
|
13724
|
+
} else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
|
13725
|
+
// Any character matches "."
|
13726
|
+
return true;
|
13633
13727
|
} else {
|
13634
13728
|
// exact char match, e.g. [a] or "a"
|
13635
13729
|
if (low <= pos->value && pos->value <= high) {
|
@@ -13690,6 +13784,7 @@ static void llama_grammar_advance_stack(
|
|
13690
13784
|
}
|
13691
13785
|
case LLAMA_GRETYPE_CHAR:
|
13692
13786
|
case LLAMA_GRETYPE_CHAR_NOT:
|
13787
|
+
case LLAMA_GRETYPE_CHAR_ANY:
|
13693
13788
|
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
13694
13789
|
// only add the stack if it's not a duplicate of one we already have
|
13695
13790
|
new_stacks.emplace_back(stack);
|
@@ -14646,260 +14741,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
14646
14741
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
14647
14742
|
}
|
14648
14743
|
|
14649
|
-
//
|
14650
|
-
// Beam search
|
14651
|
-
//
|
14652
|
-
|
14653
|
-
struct llama_beam {
|
14654
|
-
std::vector<llama_token> tokens;
|
14655
|
-
float p; // Cumulative beam probability (renormalized relative to all beams)
|
14656
|
-
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
|
14657
|
-
// Sort beams by probability. In case of ties, prefer beams at eob.
|
14658
|
-
bool operator<(const llama_beam & rhs) const {
|
14659
|
-
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
|
14660
|
-
}
|
14661
|
-
// Shift off first n tokens and discard them.
|
14662
|
-
void shift_tokens(const size_t n) {
|
14663
|
-
if (n) {
|
14664
|
-
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
|
14665
|
-
tokens.resize(tokens.size() - n);
|
14666
|
-
}
|
14667
|
-
}
|
14668
|
-
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
|
14669
|
-
};
|
14670
|
-
|
14671
|
-
// A struct for calculating logit-related info.
|
14672
|
-
struct llama_logit_info {
|
14673
|
-
const float * const logits;
|
14674
|
-
const int n_vocab;
|
14675
|
-
const float max_l;
|
14676
|
-
const float normalizer;
|
14677
|
-
struct sum_exp {
|
14678
|
-
float max_l;
|
14679
|
-
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
|
14680
|
-
};
|
14681
|
-
llama_logit_info(llama_context * ctx)
|
14682
|
-
: logits(llama_get_logits(ctx))
|
14683
|
-
, n_vocab(llama_n_vocab(llama_get_model(ctx)))
|
14684
|
-
, max_l(*std::max_element(logits, logits + n_vocab))
|
14685
|
-
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
|
14686
|
-
{ }
|
14687
|
-
llama_token_data get_token_data(const llama_token token_id) const {
|
14688
|
-
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
|
14689
|
-
return {token_id, logits[token_id], p};
|
14690
|
-
}
|
14691
|
-
// Return top k token_data by logit.
|
14692
|
-
std::vector<llama_token_data> top_k(size_t k) {
|
14693
|
-
std::vector<llama_token_data> min_heap; // min-heap by logit
|
14694
|
-
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
|
14695
|
-
min_heap.reserve(k_min);
|
14696
|
-
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
|
14697
|
-
min_heap.push_back(get_token_data(token_id));
|
14698
|
-
}
|
14699
|
-
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
|
14700
|
-
std::make_heap(min_heap.begin(), min_heap.end(), comp);
|
14701
|
-
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
|
14702
|
-
if (min_heap.front().logit < logits[token_id]) {
|
14703
|
-
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
|
14704
|
-
min_heap.back().id = token_id;
|
14705
|
-
min_heap.back().logit = logits[token_id];
|
14706
|
-
std::push_heap(min_heap.begin(), min_heap.end(), comp);
|
14707
|
-
}
|
14708
|
-
}
|
14709
|
-
return min_heap;
|
14710
|
-
}
|
14711
|
-
float probability_from_logit(float logit) const {
|
14712
|
-
return normalizer * std::exp(logit - max_l);
|
14713
|
-
}
|
14714
|
-
};
|
14715
|
-
|
14716
|
-
struct llama_beam_search_data {
|
14717
|
-
llama_context * ctx;
|
14718
|
-
size_t n_beams;
|
14719
|
-
int n_past;
|
14720
|
-
int n_predict;
|
14721
|
-
std::vector<llama_beam> beams;
|
14722
|
-
std::vector<llama_beam> next_beams;
|
14723
|
-
|
14724
|
-
// Re-calculated on each loop iteration
|
14725
|
-
size_t common_prefix_length;
|
14726
|
-
|
14727
|
-
// Used to communicate to/from callback on beams state.
|
14728
|
-
std::vector<llama_beam_view> beam_views;
|
14729
|
-
|
14730
|
-
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
|
14731
|
-
: ctx(ctx)
|
14732
|
-
, n_beams(n_beams)
|
14733
|
-
, n_past(n_past)
|
14734
|
-
, n_predict(n_predict)
|
14735
|
-
, beam_views(n_beams) {
|
14736
|
-
beams.reserve(n_beams);
|
14737
|
-
next_beams.reserve(n_beams);
|
14738
|
-
}
|
14739
|
-
|
14740
|
-
// Collapse beams to a single beam given by index.
|
14741
|
-
void collapse_beams(const size_t beam_idx) {
|
14742
|
-
if (0u < beam_idx) {
|
14743
|
-
std::swap(beams[0], beams[beam_idx]);
|
14744
|
-
}
|
14745
|
-
beams.resize(1);
|
14746
|
-
}
|
14747
|
-
|
14748
|
-
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
14749
|
-
// The repetitive patterns below reflect the 2 stages of heaps:
|
14750
|
-
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
14751
|
-
// * If the heap is full and a new element is found that should be included, pop the
|
14752
|
-
// least element to the back(), replace it with the new, then push it into the heap.
|
14753
|
-
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
|
14754
|
-
// Min-heaps use a greater-than comparator.
|
14755
|
-
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
|
14756
|
-
if (beam.eob) {
|
14757
|
-
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
|
14758
|
-
if (next_beams.size() < n_beams) {
|
14759
|
-
next_beams.push_back(std::move(beam));
|
14760
|
-
if (next_beams.size() == n_beams) {
|
14761
|
-
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
14762
|
-
}
|
14763
|
-
} else if (next_beams.front().p < beam.p) {
|
14764
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
14765
|
-
next_beams.back() = std::move(beam);
|
14766
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
14767
|
-
}
|
14768
|
-
} else {
|
14769
|
-
// beam is not at end-of-sentence, so branch with next top_k tokens.
|
14770
|
-
if (!beam.tokens.empty()) {
|
14771
|
-
llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
|
14772
|
-
}
|
14773
|
-
llama_logit_info logit_info(ctx);
|
14774
|
-
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
14775
|
-
|
14776
|
-
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
|
14777
|
-
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
|
14778
|
-
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
14779
|
-
|
14780
|
-
size_t i=0;
|
14781
|
-
if (next_beams.size() < n_beams) {
|
14782
|
-
for (; next_beams.size() < n_beams ; ++i) {
|
14783
|
-
llama_beam next_beam = beam;
|
14784
|
-
next_beam.tokens.push_back(next_tokens[i].id);
|
14785
|
-
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
14786
|
-
next_beams.push_back(std::move(next_beam));
|
14787
|
-
}
|
14788
|
-
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
14789
|
-
} else {
|
14790
|
-
for (; next_beams.front().p == 0.0f ; ++i) {
|
14791
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
14792
|
-
next_beams.back() = beam;
|
14793
|
-
next_beams.back().tokens.push_back(next_tokens[i].id);
|
14794
|
-
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
14795
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
14796
|
-
}
|
14797
|
-
}
|
14798
|
-
for (; i < n_beams ; ++i) {
|
14799
|
-
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
|
14800
|
-
if (next_beams.front().p < next_p) {
|
14801
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
14802
|
-
next_beams.back() = beam;
|
14803
|
-
next_beams.back().tokens.push_back(next_tokens[i].id);
|
14804
|
-
next_beams.back().p = next_p;
|
14805
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
14806
|
-
}
|
14807
|
-
}
|
14808
|
-
}
|
14809
|
-
}
|
14810
|
-
|
14811
|
-
// Find common_prefix_length based on beams.
|
14812
|
-
// Requires beams is not empty.
|
14813
|
-
size_t find_common_prefix_length() {
|
14814
|
-
size_t common_prefix_length = beams[0].tokens.size();
|
14815
|
-
for (size_t i = 1 ; i < beams.size() ; ++i) {
|
14816
|
-
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
|
14817
|
-
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
|
14818
|
-
if (beams[0].tokens[j] != beams[i].tokens[j]) {
|
14819
|
-
common_prefix_length = j;
|
14820
|
-
break;
|
14821
|
-
}
|
14822
|
-
}
|
14823
|
-
}
|
14824
|
-
return common_prefix_length;
|
14825
|
-
}
|
14826
|
-
|
14827
|
-
// Construct beams_state to send back to caller via the callback function.
|
14828
|
-
// Side effect: set common_prefix_length = find_common_prefix_length();
|
14829
|
-
llama_beams_state get_beams_state(const bool last_call) {
|
14830
|
-
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
14831
|
-
beam_views[i] = beams[i].view();
|
14832
|
-
}
|
14833
|
-
common_prefix_length = find_common_prefix_length();
|
14834
|
-
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
|
14835
|
-
}
|
14836
|
-
|
14837
|
-
// Loop:
|
14838
|
-
// * while i < n_predict, AND
|
14839
|
-
// * any of the beams have not yet reached end-of-beam (eob), AND
|
14840
|
-
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
|
14841
|
-
// (since all other beam probabilities can only decrease)
|
14842
|
-
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
|
14843
|
-
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
|
14844
|
-
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
|
14845
|
-
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
|
14846
|
-
!beams[top_beam_index()].eob ; ++i) {
|
14847
|
-
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
|
14848
|
-
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
|
14849
|
-
if (common_prefix_length) {
|
14850
|
-
llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
|
14851
|
-
n_past += common_prefix_length;
|
14852
|
-
}
|
14853
|
-
// Zero-out next_beam probabilities to place them last in following min-heap.
|
14854
|
-
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
|
14855
|
-
for (llama_beam & beam : beams) {
|
14856
|
-
beam.shift_tokens(common_prefix_length);
|
14857
|
-
fill_next_beams_by_top_probabilities(beam);
|
14858
|
-
}
|
14859
|
-
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
|
14860
|
-
beams.swap(next_beams);
|
14861
|
-
renormalize_beam_probabilities(beams);
|
14862
|
-
}
|
14863
|
-
collapse_beams(top_beam_index());
|
14864
|
-
callback(callback_data, get_beams_state(true));
|
14865
|
-
}
|
14866
|
-
|
14867
|
-
// As beams grow, the cumulative probabilities decrease.
|
14868
|
-
// Renormalize them to avoid floating point underflow.
|
14869
|
-
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
|
14870
|
-
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
|
14871
|
-
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
|
14872
|
-
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
|
14873
|
-
}
|
14874
|
-
|
14875
|
-
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
|
14876
|
-
size_t top_beam_index() {
|
14877
|
-
return std::max_element(beams.begin(), beams.end()) - beams.begin();
|
14878
|
-
}
|
14879
|
-
|
14880
|
-
// Copy (p,eob) for each beam which may have been changed by the callback.
|
14881
|
-
void update_beams_from_beam_views() {
|
14882
|
-
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
14883
|
-
beams[i].p = beam_views[i].p;
|
14884
|
-
beams[i].eob = beam_views[i].eob;
|
14885
|
-
}
|
14886
|
-
}
|
14887
|
-
};
|
14888
|
-
|
14889
|
-
void llama_beam_search(llama_context * ctx,
|
14890
|
-
llama_beam_search_callback_fn_t callback, void * callback_data,
|
14891
|
-
size_t n_beams, int n_past, int n_predict) {
|
14892
|
-
assert(ctx);
|
14893
|
-
const int64_t t_start_sample_us = ggml_time_us();
|
14894
|
-
|
14895
|
-
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
|
14896
|
-
|
14897
|
-
beam_search_data.loop(callback, callback_data);
|
14898
|
-
|
14899
|
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
14900
|
-
ctx->n_sample++;
|
14901
|
-
}
|
14902
|
-
|
14903
14744
|
//
|
14904
14745
|
// quantization
|
14905
14746
|
//
|
@@ -15417,6 +15258,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
15417
15258
|
if (imatrix_data) {
|
15418
15259
|
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
|
15419
15260
|
qs.has_imatrix = true;
|
15261
|
+
// check imatrix for nans or infs
|
15262
|
+
for (const auto & kv : *imatrix_data) {
|
15263
|
+
for (float f : kv.second) {
|
15264
|
+
if (!std::isfinite(f)) {
|
15265
|
+
throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
|
15266
|
+
}
|
15267
|
+
}
|
15268
|
+
}
|
15420
15269
|
}
|
15421
15270
|
}
|
15422
15271
|
|
@@ -16110,7 +15959,7 @@ bool llama_supports_mlock(void) {
|
|
16110
15959
|
}
|
16111
15960
|
|
16112
15961
|
bool llama_supports_gpu_offload(void) {
|
16113
|
-
#if defined(GGML_USE_CUDA) || defined(
|
15962
|
+
#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
16114
15963
|
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
|
16115
15964
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
16116
15965
|
return true;
|
@@ -16167,7 +16016,7 @@ struct llama_model * llama_load_model_from_file(
|
|
16167
16016
|
return true;
|
16168
16017
|
};
|
16169
16018
|
}
|
16170
|
-
if (params.rpc_servers != nullptr) {
|
16019
|
+
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
|
16171
16020
|
// split the servers set them into model->rpc_servers
|
16172
16021
|
std::string servers(params.rpc_servers);
|
16173
16022
|
size_t pos = 0;
|
@@ -16221,6 +16070,11 @@ struct llama_context * llama_new_context_with_model(
|
|
16221
16070
|
params.flash_attn = false;
|
16222
16071
|
}
|
16223
16072
|
|
16073
|
+
if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
|
16074
|
+
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
|
16075
|
+
return nullptr;
|
16076
|
+
}
|
16077
|
+
|
16224
16078
|
llama_context * ctx = new llama_context(*model);
|
16225
16079
|
|
16226
16080
|
const auto & hparams = model->hparams;
|
@@ -16259,8 +16113,8 @@ struct llama_context * llama_new_context_with_model(
|
|
16259
16113
|
|
16260
16114
|
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
16261
16115
|
|
16262
|
-
cparams.
|
16263
|
-
hparams.
|
16116
|
+
cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
|
16117
|
+
hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
|
16264
16118
|
hparams.n_ctx_train;
|
16265
16119
|
|
16266
16120
|
cparams.cb_eval = params.cb_eval;
|
@@ -16325,17 +16179,7 @@ struct llama_context * llama_new_context_with_model(
|
|
16325
16179
|
|
16326
16180
|
if (!hparams.vocab_only) {
|
16327
16181
|
// initialize backends
|
16328
|
-
#if defined(
|
16329
|
-
for (auto & server : model->rpc_servers) {
|
16330
|
-
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
16331
|
-
if (backend == nullptr) {
|
16332
|
-
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
16333
|
-
llama_free(ctx);
|
16334
|
-
return nullptr;
|
16335
|
-
}
|
16336
|
-
ctx->backends.push_back(backend);
|
16337
|
-
}
|
16338
|
-
#elif defined(GGML_USE_METAL)
|
16182
|
+
#if defined(GGML_USE_METAL)
|
16339
16183
|
if (model->n_gpu_layers > 0) {
|
16340
16184
|
ctx->backend_metal = ggml_backend_metal_init();
|
16341
16185
|
if (ctx->backend_metal == nullptr) {
|
@@ -16374,7 +16218,7 @@ struct llama_context * llama_new_context_with_model(
|
|
16374
16218
|
return nullptr;
|
16375
16219
|
}
|
16376
16220
|
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
16377
|
-
ggml_backend_t backend = ggml_backend_vk_init(
|
16221
|
+
ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
|
16378
16222
|
if (backend == nullptr) {
|
16379
16223
|
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
16380
16224
|
llama_free(ctx);
|
@@ -16428,6 +16272,29 @@ struct llama_context * llama_new_context_with_model(
|
|
16428
16272
|
ctx->backends.push_back(backend);
|
16429
16273
|
}
|
16430
16274
|
#endif
|
16275
|
+
|
16276
|
+
#ifdef GGML_USE_BLAS
|
16277
|
+
ctx->backend_blas = ggml_backend_blas_init();
|
16278
|
+
if (ctx->backend_blas == nullptr) {
|
16279
|
+
LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
|
16280
|
+
} else {
|
16281
|
+
ctx->backends.push_back(ctx->backend_blas);
|
16282
|
+
}
|
16283
|
+
#endif
|
16284
|
+
|
16285
|
+
#if defined(GGML_USE_RPC)
|
16286
|
+
if (model->n_gpu_layers > 0) {
|
16287
|
+
for (const auto & endpoint : model->rpc_servers) {
|
16288
|
+
ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
|
16289
|
+
if (backend == nullptr) {
|
16290
|
+
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
|
16291
|
+
llama_free(ctx);
|
16292
|
+
return nullptr;
|
16293
|
+
}
|
16294
|
+
ctx->backends.push_back(backend);
|
16295
|
+
}
|
16296
|
+
}
|
16297
|
+
#endif
|
16431
16298
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
16432
16299
|
if (ctx->backend_cpu == nullptr) {
|
16433
16300
|
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
@@ -18209,9 +18076,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
|
|
18209
18076
|
return model->vocab.id_to_token[token].score;
|
18210
18077
|
}
|
18211
18078
|
|
18212
|
-
|
18079
|
+
llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
|
18213
18080
|
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
18214
|
-
return model->vocab.id_to_token[token].
|
18081
|
+
return model->vocab.id_to_token[token].attr;
|
18215
18082
|
}
|
18216
18083
|
|
18217
18084
|
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
@@ -18313,9 +18180,14 @@ static std::string llama_decode_text(const std::string & text) {
|
|
18313
18180
|
|
18314
18181
|
// does not write null-terminator to buf
|
18315
18182
|
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
|
18183
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
|
18184
|
+
if (!special && llama_is_control_token(model->vocab, token)) {
|
18185
|
+
return 0;
|
18186
|
+
}
|
18187
|
+
|
18316
18188
|
// if we have a cache - use it
|
18317
18189
|
{
|
18318
|
-
const auto & cache =
|
18190
|
+
const auto & cache = model->vocab.cache_token_to_piece;
|
18319
18191
|
|
18320
18192
|
if (!cache.empty()) {
|
18321
18193
|
const auto & res = cache.at(token);
|