llama_cpp 0.15.4 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +3 -2
- data/ext/llama_cpp/llama_cpp.cpp +17 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +166 -82
- data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
- data/vendor/tmp/llama.cpp/ggml.c +278 -603
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +345 -473
- data/vendor/tmp/llama.cpp/llama.h +21 -43
- metadata +134 -7
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
|
@@ -13,8 +13,6 @@
|
|
|
13
13
|
|
|
14
14
|
#ifdef GGML_USE_CUDA
|
|
15
15
|
# include "ggml-cuda.h"
|
|
16
|
-
#elif defined(GGML_USE_CLBLAST)
|
|
17
|
-
# include "ggml-opencl.h"
|
|
18
16
|
#elif defined(GGML_USE_VULKAN)
|
|
19
17
|
# include "ggml-vulkan.h"
|
|
20
18
|
#elif defined(GGML_USE_SYCL)
|
|
@@ -23,6 +21,10 @@
|
|
|
23
21
|
# include "ggml-kompute.h"
|
|
24
22
|
#endif
|
|
25
23
|
|
|
24
|
+
#ifdef GGML_USE_BLAS
|
|
25
|
+
# include "ggml-blas.h"
|
|
26
|
+
#endif
|
|
27
|
+
|
|
26
28
|
#ifdef GGML_USE_METAL
|
|
27
29
|
# include "ggml-metal.h"
|
|
28
30
|
#endif
|
|
@@ -110,7 +112,7 @@
|
|
|
110
112
|
//
|
|
111
113
|
|
|
112
114
|
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
|
113
|
-
static void llama_log_internal (ggml_log_level level, const char* format, ...);
|
|
115
|
+
static void llama_log_internal (ggml_log_level level, const char * format, ...);
|
|
114
116
|
static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
|
115
117
|
|
|
116
118
|
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
|
@@ -706,6 +708,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
|
706
708
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
707
709
|
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
|
708
710
|
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
|
711
|
+
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
|
|
709
712
|
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
|
710
713
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
711
714
|
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
|
@@ -1850,7 +1853,7 @@ struct llama_hparams {
|
|
|
1850
1853
|
float rope_attn_factor = 1.0f;
|
|
1851
1854
|
float rope_freq_base_train;
|
|
1852
1855
|
float rope_freq_scale_train;
|
|
1853
|
-
uint32_t n_yarn_orig_ctx;
|
|
1856
|
+
uint32_t n_ctx_orig_yarn;
|
|
1854
1857
|
float rope_yarn_log_mul;
|
|
1855
1858
|
|
|
1856
1859
|
// for State Space Models
|
|
@@ -1892,7 +1895,7 @@ struct llama_hparams {
|
|
|
1892
1895
|
if (this->n_expert_shared != other.n_expert_shared) return true;
|
|
1893
1896
|
|
|
1894
1897
|
if (this->rope_finetuned != other.rope_finetuned) return true;
|
|
1895
|
-
if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
|
|
1898
|
+
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
|
|
1896
1899
|
|
|
1897
1900
|
if (this->ssm_d_conv != other.ssm_d_conv) return true;
|
|
1898
1901
|
if (this->ssm_d_inner != other.ssm_d_inner) return true;
|
|
@@ -1951,7 +1954,7 @@ struct llama_cparams {
|
|
|
1951
1954
|
float rope_freq_base;
|
|
1952
1955
|
float rope_freq_scale;
|
|
1953
1956
|
|
|
1954
|
-
uint32_t n_yarn_orig_ctx;
|
|
1957
|
+
uint32_t n_ctx_orig_yarn;
|
|
1955
1958
|
// These hyperparameters are not exposed in GGUF, because all
|
|
1956
1959
|
// existing YaRN models use the same values for them.
|
|
1957
1960
|
float yarn_ext_factor;
|
|
@@ -2149,12 +2152,12 @@ struct llama_control_vector {
|
|
|
2149
2152
|
struct llama_vocab {
|
|
2150
2153
|
using id = int32_t;
|
|
2151
2154
|
using token = std::string;
|
|
2152
|
-
using ttype = llama_token_type;
|
|
2155
|
+
using tattr = llama_token_attr;
|
|
2153
2156
|
|
|
2154
2157
|
struct token_data {
|
|
2155
2158
|
token text;
|
|
2156
2159
|
float score;
|
|
2157
|
-
|
|
2160
|
+
tattr attr;
|
|
2158
2161
|
};
|
|
2159
2162
|
|
|
2160
2163
|
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
|
@@ -2164,8 +2167,7 @@ struct llama_vocab {
|
|
|
2164
2167
|
std::vector<token_data> id_to_token;
|
|
2165
2168
|
|
|
2166
2169
|
std::vector<id> cache_special_tokens;
|
|
2167
|
-
std::vector<token> cache_token_to_piece;
|
|
2168
|
-
std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
|
|
2170
|
+
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
|
|
2169
2171
|
|
|
2170
2172
|
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
|
2171
2173
|
|
|
@@ -2301,9 +2303,13 @@ struct llama_context {
|
|
|
2301
2303
|
std::vector<ggml_backend_t> backends;
|
|
2302
2304
|
#ifdef GGML_USE_METAL
|
|
2303
2305
|
ggml_backend_t backend_metal = nullptr;
|
|
2306
|
+
#endif
|
|
2307
|
+
#ifdef GGML_USE_BLAS
|
|
2308
|
+
ggml_backend_t backend_blas = nullptr;
|
|
2304
2309
|
#endif
|
|
2305
2310
|
ggml_backend_t backend_cpu = nullptr;
|
|
2306
2311
|
|
|
2312
|
+
|
|
2307
2313
|
const llama_model & model;
|
|
2308
2314
|
|
|
2309
2315
|
// key + value cache for the self attention
|
|
@@ -2372,13 +2378,34 @@ struct llama_context {
|
|
|
2372
2378
|
struct llama_control_vector cvec;
|
|
2373
2379
|
};
|
|
2374
2380
|
|
|
2381
|
+
static size_t llama_get_device_count(const llama_model & model) {
|
|
2382
|
+
size_t count = 1;
|
|
2383
|
+
#if defined(GGML_USE_CUDA)
|
|
2384
|
+
count = ggml_backend_cuda_get_device_count();
|
|
2385
|
+
#elif defined(GGML_USE_SYCL)
|
|
2386
|
+
count = ggml_backend_sycl_get_device_count();
|
|
2387
|
+
#elif defined(GGML_USE_VULKAN)
|
|
2388
|
+
count = ggml_backend_vk_get_device_count();
|
|
2389
|
+
#endif
|
|
2390
|
+
#if defined(GGML_USE_RPC)
|
|
2391
|
+
count += model.rpc_servers.size();
|
|
2392
|
+
#endif
|
|
2393
|
+
return count;
|
|
2394
|
+
GGML_UNUSED(model);
|
|
2395
|
+
}
|
|
2396
|
+
|
|
2375
2397
|
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
|
2376
2398
|
ggml_backend_buffer_type_t buft = nullptr;
|
|
2377
2399
|
|
|
2378
|
-
#ifdef GGML_USE_RPC
|
|
2379
|
-
|
|
2380
|
-
|
|
2381
|
-
|
|
2400
|
+
#if defined(GGML_USE_RPC)
|
|
2401
|
+
int dev_count = (int)llama_get_device_count(model);
|
|
2402
|
+
int rpc_count = (int)model.rpc_servers.size();
|
|
2403
|
+
if (gpu >= dev_count - rpc_count) {
|
|
2404
|
+
const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
|
|
2405
|
+
return ggml_backend_rpc_buffer_type(endpoint);
|
|
2406
|
+
}
|
|
2407
|
+
#endif
|
|
2408
|
+
#if defined(GGML_USE_METAL)
|
|
2382
2409
|
buft = ggml_backend_metal_buffer_type();
|
|
2383
2410
|
#elif defined(GGML_USE_CUDA)
|
|
2384
2411
|
buft = ggml_backend_cuda_buffer_type(gpu);
|
|
@@ -2386,8 +2413,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
|
|
|
2386
2413
|
buft = ggml_backend_vk_buffer_type(gpu);
|
|
2387
2414
|
#elif defined(GGML_USE_SYCL)
|
|
2388
2415
|
buft = ggml_backend_sycl_buffer_type(gpu);
|
|
2389
|
-
#elif defined(GGML_USE_CLBLAST)
|
|
2390
|
-
buft = ggml_backend_opencl_buffer_type();
|
|
2391
2416
|
#elif defined(GGML_USE_KOMPUTE)
|
|
2392
2417
|
buft = ggml_backend_kompute_buffer_type(gpu);
|
|
2393
2418
|
if (buft == nullptr) {
|
|
@@ -2426,29 +2451,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
|
|
|
2426
2451
|
GGML_UNUSED(tensor_split);
|
|
2427
2452
|
}
|
|
2428
2453
|
|
|
2429
|
-
static size_t llama_get_device_count(const llama_model & model) {
|
|
2430
|
-
#if defined(GGML_USE_RPC)
|
|
2431
|
-
return model.rpc_servers.size();
|
|
2432
|
-
#elif defined(GGML_USE_CUDA)
|
|
2433
|
-
return ggml_backend_cuda_get_device_count();
|
|
2434
|
-
#elif defined(GGML_USE_SYCL)
|
|
2435
|
-
return ggml_backend_sycl_get_device_count();
|
|
2436
|
-
#elif defined(GGML_USE_VULKAN)
|
|
2437
|
-
return ggml_backend_vk_get_device_count();
|
|
2438
|
-
#else
|
|
2439
|
-
return 1;
|
|
2440
|
-
#endif
|
|
2441
|
-
GGML_UNUSED(model);
|
|
2442
|
-
}
|
|
2443
|
-
|
|
2444
2454
|
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
|
2445
2455
|
#if defined(GGML_USE_RPC)
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2456
|
+
int dev_count = (int)llama_get_device_count(model);
|
|
2457
|
+
int rpc_count = (int)model.rpc_servers.size();
|
|
2458
|
+
if (device >= dev_count - rpc_count) {
|
|
2459
|
+
size_t total;
|
|
2460
|
+
size_t free;
|
|
2461
|
+
const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
|
|
2462
|
+
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
|
2463
|
+
return free;
|
|
2464
|
+
}
|
|
2465
|
+
#endif
|
|
2466
|
+
#if defined(GGML_USE_CUDA)
|
|
2452
2467
|
size_t total;
|
|
2453
2468
|
size_t free;
|
|
2454
2469
|
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
|
@@ -2520,10 +2535,6 @@ static bool llama_kv_cache_init(
|
|
|
2520
2535
|
}
|
|
2521
2536
|
}
|
|
2522
2537
|
|
|
2523
|
-
#ifdef GGML_USE_CLBLAST
|
|
2524
|
-
offload = false;
|
|
2525
|
-
#endif
|
|
2526
|
-
|
|
2527
2538
|
// count used buffer types
|
|
2528
2539
|
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
|
2529
2540
|
if (offload) {
|
|
@@ -4003,8 +4014,8 @@ static void llm_load_hparams(
|
|
|
4003
4014
|
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
|
4004
4015
|
hparams.rope_finetuned = rope_finetuned;
|
|
4005
4016
|
|
|
4006
|
-
hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
|
|
4007
|
-
ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
|
|
4017
|
+
hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
|
|
4018
|
+
ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
|
|
4008
4019
|
|
|
4009
4020
|
// rope_freq_base (optional)
|
|
4010
4021
|
hparams.rope_freq_base_train = 10000.0f;
|
|
@@ -4550,35 +4561,6 @@ static void llm_load_vocab(
|
|
|
4550
4561
|
vocab.special_cls_id = -1;
|
|
4551
4562
|
vocab.special_mask_id = -1;
|
|
4552
4563
|
|
|
4553
|
-
// For Fill-In-the-Middle (FIM)/infill models which where converted
|
|
4554
|
-
// prior to support of FIM special tokens in GGUF, the following
|
|
4555
|
-
// will allow those models to continue to work. The general names
|
|
4556
|
-
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
|
|
4557
|
-
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
|
|
4558
|
-
// new versions of these models have been published.
|
|
4559
|
-
std::string gen_name;
|
|
4560
|
-
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
|
|
4561
|
-
|
|
4562
|
-
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
|
|
4563
|
-
[](unsigned char c){ return std::tolower(c); });
|
|
4564
|
-
|
|
4565
|
-
if (gen_name.find("code") != std::string::npos) {
|
|
4566
|
-
if (model.arch == LLM_ARCH_LLAMA) {
|
|
4567
|
-
vocab.special_prefix_id = 32007;
|
|
4568
|
-
vocab.special_suffix_id = 32008;
|
|
4569
|
-
vocab.special_middle_id = 32009;
|
|
4570
|
-
vocab.special_eot_id = 32010;
|
|
4571
|
-
} else if (model.arch == LLM_ARCH_GEMMA) {
|
|
4572
|
-
vocab.special_prefix_id = 67;
|
|
4573
|
-
vocab.special_suffix_id = 69;
|
|
4574
|
-
vocab.special_middle_id = 68;
|
|
4575
|
-
// TODO: this is not EOT, it is "file separator" token, needs fix
|
|
4576
|
-
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
|
|
4577
|
-
//vocab.special_eot_id = 70;
|
|
4578
|
-
vocab.special_eot_id = 107;
|
|
4579
|
-
}
|
|
4580
|
-
}
|
|
4581
|
-
|
|
4582
4564
|
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
|
4583
4565
|
if (add_space_prefix_keyidx != -1) {
|
|
4584
4566
|
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
|
@@ -4651,8 +4633,7 @@ static void llm_load_vocab(
|
|
|
4651
4633
|
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
|
4652
4634
|
LLAMA_LOG_WARN("%s: \n", __func__);
|
|
4653
4635
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
|
4654
|
-
} else if (
|
|
4655
|
-
tokenizer_pre == "default") {
|
|
4636
|
+
} else if (tokenizer_pre == "default") {
|
|
4656
4637
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
|
4657
4638
|
} else if (
|
|
4658
4639
|
tokenizer_pre == "llama3" ||
|
|
@@ -4679,7 +4660,8 @@ static void llm_load_vocab(
|
|
|
4679
4660
|
tokenizer_pre == "jina-es" ||
|
|
4680
4661
|
tokenizer_pre == "jina-de" ||
|
|
4681
4662
|
tokenizer_pre == "jina-v2-es" ||
|
|
4682
|
-
tokenizer_pre == "jina-v2-de") {
|
|
4663
|
+
tokenizer_pre == "jina-v2-de" ||
|
|
4664
|
+
tokenizer_pre == "jina-v2-code") {
|
|
4683
4665
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
|
4684
4666
|
} else if (
|
|
4685
4667
|
tokenizer_pre == "refact") {
|
|
@@ -4702,6 +4684,9 @@ static void llm_load_vocab(
|
|
|
4702
4684
|
} else if (
|
|
4703
4685
|
tokenizer_pre == "smaug-bpe") {
|
|
4704
4686
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
|
|
4687
|
+
} else if (
|
|
4688
|
+
tokenizer_pre == "poro-chat") {
|
|
4689
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
|
|
4705
4690
|
} else {
|
|
4706
4691
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
|
4707
4692
|
}
|
|
@@ -4740,12 +4725,64 @@ static void llm_load_vocab(
|
|
|
4740
4725
|
auto & token_data = vocab.id_to_token[i];
|
|
4741
4726
|
token_data.text = std::move(word);
|
|
4742
4727
|
token_data.score = scores ? scores[i] : 0.0f;
|
|
4743
|
-
token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
|
|
4728
|
+
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
|
|
4729
|
+
|
|
4730
|
+
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
|
|
4731
|
+
switch(toktypes[i]) {
|
|
4732
|
+
case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
|
|
4733
|
+
case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
|
|
4734
|
+
case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
|
|
4735
|
+
case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
|
|
4736
|
+
case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
|
|
4737
|
+
case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
|
|
4738
|
+
case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
|
4739
|
+
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
|
4740
|
+
}
|
|
4741
|
+
}
|
|
4744
4742
|
}
|
|
4745
4743
|
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
|
|
4746
4744
|
|
|
4747
4745
|
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
|
4748
4746
|
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
|
4747
|
+
// For Fill-In-the-Middle (FIM)/infill models which where converted
|
|
4748
|
+
// prior to support of FIM special tokens in GGUF, the following
|
|
4749
|
+
// will allow those models to continue to work. The general names
|
|
4750
|
+
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
|
|
4751
|
+
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
|
|
4752
|
+
// new versions of these models have been published.
|
|
4753
|
+
std::string gen_name;
|
|
4754
|
+
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
|
|
4755
|
+
|
|
4756
|
+
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
|
|
4757
|
+
[](unsigned char c){ return std::tolower(c); });
|
|
4758
|
+
|
|
4759
|
+
if (gen_name.find("code") != std::string::npos) {
|
|
4760
|
+
if (model.arch == LLM_ARCH_LLAMA
|
|
4761
|
+
&& 32010 < vocab.id_to_token.size()
|
|
4762
|
+
&& vocab.id_to_token[32007].text == "<PRE>"
|
|
4763
|
+
&& vocab.id_to_token[32008].text == "<SUF>"
|
|
4764
|
+
&& vocab.id_to_token[32009].text == "<MID>"
|
|
4765
|
+
&& vocab.id_to_token[32010].text == "<EOT>") {
|
|
4766
|
+
vocab.special_prefix_id = 32007;
|
|
4767
|
+
vocab.special_suffix_id = 32008;
|
|
4768
|
+
vocab.special_middle_id = 32009;
|
|
4769
|
+
vocab.special_eot_id = 32010;
|
|
4770
|
+
} else if (model.arch == LLM_ARCH_GEMMA
|
|
4771
|
+
&& 107 < vocab.id_to_token.size()
|
|
4772
|
+
&& vocab.id_to_token[67].text == "<|fim_prefix|>"
|
|
4773
|
+
&& vocab.id_to_token[69].text == "<|fim_suffix|>"
|
|
4774
|
+
&& vocab.id_to_token[68].text == "<|fim_middle|>"
|
|
4775
|
+
&& vocab.id_to_token[107].text == "<end_of_turn>") {
|
|
4776
|
+
vocab.special_prefix_id = 67;
|
|
4777
|
+
vocab.special_suffix_id = 69;
|
|
4778
|
+
vocab.special_middle_id = 68;
|
|
4779
|
+
// TODO: this is not EOT, it is "file separator" token, needs fix
|
|
4780
|
+
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
|
|
4781
|
+
//vocab.special_eot_id = 70;
|
|
4782
|
+
vocab.special_eot_id = 107;
|
|
4783
|
+
}
|
|
4784
|
+
}
|
|
4785
|
+
|
|
4749
4786
|
try {
|
|
4750
4787
|
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
|
4751
4788
|
} catch (const std::exception & e) {
|
|
@@ -4831,7 +4868,7 @@ static void llm_load_vocab(
|
|
|
4831
4868
|
// build special tokens cache
|
|
4832
4869
|
{
|
|
4833
4870
|
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
|
|
4834
|
-
if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
|
|
4871
|
+
if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
|
|
4835
4872
|
vocab.cache_special_tokens.push_back(id);
|
|
4836
4873
|
}
|
|
4837
4874
|
}
|
|
@@ -4845,26 +4882,75 @@ static void llm_load_vocab(
|
|
|
4845
4882
|
LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
|
|
4846
4883
|
}
|
|
4847
4884
|
|
|
4848
|
-
// build token to piece
|
|
4885
|
+
// build token to piece cache
|
|
4849
4886
|
{
|
|
4850
4887
|
size_t size_cache = 0;
|
|
4851
4888
|
|
|
4852
|
-
std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
|
|
4853
|
-
std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
|
|
4889
|
+
std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
|
|
4854
4890
|
|
|
4855
4891
|
for (uint32_t id = 0; id < n_vocab; ++id) {
|
|
4856
|
-
cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
|
|
4857
|
-
cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
|
|
4892
|
+
cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
|
|
4858
4893
|
|
|
4859
4894
|
size_cache += cache_token_to_piece[id].size();
|
|
4860
|
-
size_cache += cache_token_to_piece_special[id].size();
|
|
4861
4895
|
}
|
|
4862
4896
|
|
|
4863
|
-
std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
|
|
4864
|
-
std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
|
|
4897
|
+
std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
|
|
4865
4898
|
|
|
4866
4899
|
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
|
|
4867
4900
|
}
|
|
4901
|
+
|
|
4902
|
+
// Handle per token attributes
|
|
4903
|
+
//NOTE: Each model customizes per token attributes.
|
|
4904
|
+
//NOTE: Per token attributes are missing from the GGUF file.
|
|
4905
|
+
//TODO: Extract attributes from GGUF file.
|
|
4906
|
+
{
|
|
4907
|
+
auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
|
|
4908
|
+
for (auto substr : substrs) {
|
|
4909
|
+
if (str.find(substr) < std::string::npos) {
|
|
4910
|
+
return true;
|
|
4911
|
+
}
|
|
4912
|
+
}
|
|
4913
|
+
return false;
|
|
4914
|
+
};
|
|
4915
|
+
|
|
4916
|
+
auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
|
|
4917
|
+
uint32_t current = vocab.id_to_token.at(id).attr;
|
|
4918
|
+
current = value ? (current | attr) : (current & ~attr);
|
|
4919
|
+
vocab.id_to_token[id].attr = (llama_token_attr) current;
|
|
4920
|
+
};
|
|
4921
|
+
|
|
4922
|
+
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
|
|
4923
|
+
_set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
|
|
4924
|
+
};
|
|
4925
|
+
|
|
4926
|
+
std::string model_name;
|
|
4927
|
+
std::string tokenizer_pre;
|
|
4928
|
+
|
|
4929
|
+
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
|
|
4930
|
+
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
|
4931
|
+
|
|
4932
|
+
// model name to lowercase
|
|
4933
|
+
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
|
|
4934
|
+
[] (const std::string::value_type x) {
|
|
4935
|
+
return std::tolower(x);
|
|
4936
|
+
}
|
|
4937
|
+
);
|
|
4938
|
+
|
|
4939
|
+
// set attributes by model/tokenizer name
|
|
4940
|
+
if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
|
|
4941
|
+
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
|
|
4942
|
+
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
|
|
4943
|
+
for (auto id : vocab.cache_special_tokens) {
|
|
4944
|
+
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
|
4945
|
+
}
|
|
4946
|
+
for (auto token : {"</s>"}) {
|
|
4947
|
+
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
|
4948
|
+
}
|
|
4949
|
+
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
|
|
4950
|
+
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
|
|
4951
|
+
}
|
|
4952
|
+
}
|
|
4953
|
+
}
|
|
4868
4954
|
}
|
|
4869
4955
|
|
|
4870
4956
|
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
@@ -4904,7 +4990,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
|
4904
4990
|
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
|
4905
4991
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
|
4906
4992
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
|
4907
|
-
LLAMA_LOG_INFO("%s:
|
|
4993
|
+
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
|
4908
4994
|
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
|
|
4909
4995
|
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
|
4910
4996
|
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
|
@@ -5129,12 +5215,10 @@ static bool llm_load_tensors(
|
|
|
5129
5215
|
// output
|
|
5130
5216
|
{
|
|
5131
5217
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
5132
|
-
|
|
5133
|
-
|
|
5134
|
-
|
|
5135
|
-
|
|
5136
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
|
5137
|
-
}
|
|
5218
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5219
|
+
// if output is NULL, init from the input tok embed
|
|
5220
|
+
if (model.output == NULL) {
|
|
5221
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
|
5138
5222
|
}
|
|
5139
5223
|
}
|
|
5140
5224
|
|
|
@@ -5453,7 +5537,7 @@ static bool llm_load_tensors(
|
|
|
5453
5537
|
|
|
5454
5538
|
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
|
5455
5539
|
} else {
|
|
5456
|
-
layer.ffn_gate
|
|
5540
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
|
5457
5541
|
}
|
|
5458
5542
|
|
|
5459
5543
|
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
|
@@ -5494,6 +5578,9 @@ static bool llm_load_tensors(
|
|
|
5494
5578
|
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
|
|
5495
5579
|
layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
|
|
5496
5580
|
|
|
5581
|
+
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5582
|
+
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5583
|
+
|
|
5497
5584
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
5498
5585
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
|
5499
5586
|
|
|
@@ -7072,7 +7159,7 @@ struct llm_build_context {
|
|
|
7072
7159
|
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
|
|
7073
7160
|
const int32_t n_outputs;
|
|
7074
7161
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
|
7075
|
-
const int32_t
|
|
7162
|
+
const int32_t n_ctx_orig;
|
|
7076
7163
|
|
|
7077
7164
|
const bool flash_attn;
|
|
7078
7165
|
|
|
@@ -7121,7 +7208,7 @@ struct llm_build_context {
|
|
|
7121
7208
|
n_kv (worst_case ? kv_self.size : kv_self.n),
|
|
7122
7209
|
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
|
|
7123
7210
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
|
7124
|
-
|
|
7211
|
+
n_ctx_orig (cparams.n_ctx_orig_yarn),
|
|
7125
7212
|
flash_attn (cparams.flash_attn),
|
|
7126
7213
|
pooling_type (cparams.pooling_type),
|
|
7127
7214
|
rope_type (hparams.rope_type),
|
|
@@ -7179,7 +7266,7 @@ struct llm_build_context {
|
|
|
7179
7266
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
|
|
7180
7267
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
|
7181
7268
|
0),
|
|
7182
|
-
lctx.inp_K_shift, rope_factors, n_rot, rope_type,
|
|
7269
|
+
lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7183
7270
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
7184
7271
|
|
|
7185
7272
|
cb(tmp, "K_shifted", il);
|
|
@@ -7288,7 +7375,7 @@ struct llm_build_context {
|
|
|
7288
7375
|
// choose long/short freq factors based on the context size
|
|
7289
7376
|
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
|
|
7290
7377
|
|
|
7291
|
-
if (n_ctx_pre_seq > hparams.
|
|
7378
|
+
if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
|
|
7292
7379
|
return model.layers[il].rope_long;
|
|
7293
7380
|
}
|
|
7294
7381
|
|
|
@@ -7404,14 +7491,14 @@ struct llm_build_context {
|
|
|
7404
7491
|
|
|
7405
7492
|
Qcur = ggml_rope_ext(
|
|
7406
7493
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7407
|
-
n_rot, rope_type,
|
|
7494
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7408
7495
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7409
7496
|
);
|
|
7410
7497
|
cb(Qcur, "Qcur", il);
|
|
7411
7498
|
|
|
7412
7499
|
Kcur = ggml_rope_ext(
|
|
7413
7500
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7414
|
-
n_rot, rope_type,
|
|
7501
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7415
7502
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7416
7503
|
);
|
|
7417
7504
|
cb(Kcur, "Kcur", il);
|
|
@@ -7535,12 +7622,12 @@ struct llm_build_context {
|
|
|
7535
7622
|
case MODEL_7B:
|
|
7536
7623
|
Qcur = ggml_rope_ext(
|
|
7537
7624
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7538
|
-
n_rot, rope_type,
|
|
7625
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7539
7626
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7540
7627
|
);
|
|
7541
7628
|
Kcur = ggml_rope_ext(
|
|
7542
7629
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7543
|
-
n_rot, rope_type,
|
|
7630
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7544
7631
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7545
7632
|
);
|
|
7546
7633
|
break;
|
|
@@ -7647,14 +7734,14 @@ struct llm_build_context {
|
|
|
7647
7734
|
|
|
7648
7735
|
Qcur = ggml_rope_ext(
|
|
7649
7736
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7650
|
-
n_rot, rope_type,
|
|
7737
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7651
7738
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7652
7739
|
);
|
|
7653
7740
|
cb(Qcur, "Qcur", il);
|
|
7654
7741
|
|
|
7655
7742
|
Kcur = ggml_rope_ext(
|
|
7656
7743
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7657
|
-
n_rot, rope_type,
|
|
7744
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7658
7745
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7659
7746
|
);
|
|
7660
7747
|
cb(Kcur, "Kcur", il);
|
|
@@ -7767,13 +7854,13 @@ struct llm_build_context {
|
|
|
7767
7854
|
|
|
7768
7855
|
// using mode = 2 for neox mode
|
|
7769
7856
|
Qcur = ggml_rope_ext(
|
|
7770
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
|
7857
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
7771
7858
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
7772
7859
|
);
|
|
7773
7860
|
cb(Qcur, "Qcur", il);
|
|
7774
7861
|
|
|
7775
7862
|
Kcur = ggml_rope_ext(
|
|
7776
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
|
7863
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
7777
7864
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
7778
7865
|
);
|
|
7779
7866
|
cb(Kcur, "Kcur", il);
|
|
@@ -7891,14 +7978,14 @@ struct llm_build_context {
|
|
|
7891
7978
|
|
|
7892
7979
|
Qcur = ggml_rope_ext(
|
|
7893
7980
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7894
|
-
n_rot, rope_type,
|
|
7981
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7895
7982
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7896
7983
|
);
|
|
7897
7984
|
cb(Qcur, "Qcur", il);
|
|
7898
7985
|
|
|
7899
7986
|
Kcur = ggml_rope_ext(
|
|
7900
7987
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7901
|
-
n_rot, rope_type,
|
|
7988
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7902
7989
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7903
7990
|
);
|
|
7904
7991
|
cb(Kcur, "Kcur", il);
|
|
@@ -8044,14 +8131,14 @@ struct llm_build_context {
|
|
|
8044
8131
|
|
|
8045
8132
|
Qcur = ggml_rope_ext(
|
|
8046
8133
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
8047
|
-
n_rot, rope_type,
|
|
8134
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8048
8135
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8049
8136
|
);
|
|
8050
8137
|
cb(Qcur, "Qcur", il);
|
|
8051
8138
|
|
|
8052
8139
|
Kcur = ggml_rope_ext(
|
|
8053
8140
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
8054
|
-
n_rot, rope_type,
|
|
8141
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8055
8142
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8056
8143
|
);
|
|
8057
8144
|
cb(Kcur, "Kcur", il);
|
|
@@ -8398,14 +8485,14 @@ struct llm_build_context {
|
|
|
8398
8485
|
|
|
8399
8486
|
Qcur = ggml_rope_ext(
|
|
8400
8487
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
8401
|
-
n_rot, rope_type,
|
|
8488
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8402
8489
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8403
8490
|
);
|
|
8404
8491
|
cb(Qcur, "Qcur", il);
|
|
8405
8492
|
|
|
8406
8493
|
Kcur = ggml_rope_ext(
|
|
8407
8494
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
8408
|
-
n_rot, rope_type,
|
|
8495
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8409
8496
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8410
8497
|
);
|
|
8411
8498
|
cb(Kcur, "Kcur", il);
|
|
@@ -8457,6 +8544,11 @@ struct llm_build_context {
|
|
|
8457
8544
|
// attention layer norm
|
|
8458
8545
|
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
|
|
8459
8546
|
|
|
8547
|
+
if (model.layers[il].attn_norm_2 != nullptr) {
|
|
8548
|
+
cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
|
|
8549
|
+
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
|
|
8550
|
+
}
|
|
8551
|
+
|
|
8460
8552
|
struct ggml_tensor * ffn_inp = cur;
|
|
8461
8553
|
cb(ffn_inp, "ffn_inp", il);
|
|
8462
8554
|
|
|
@@ -8838,14 +8930,14 @@ struct llm_build_context {
|
|
|
8838
8930
|
|
|
8839
8931
|
Qcur = ggml_rope_ext(
|
|
8840
8932
|
ctx0, Qcur, inp_pos, nullptr,
|
|
8841
|
-
n_rot, rope_type,
|
|
8933
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8842
8934
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8843
8935
|
);
|
|
8844
8936
|
cb(Qcur, "Qcur", il);
|
|
8845
8937
|
|
|
8846
8938
|
Kcur = ggml_rope_ext(
|
|
8847
8939
|
ctx0, Kcur, inp_pos, nullptr,
|
|
8848
|
-
n_rot, rope_type,
|
|
8940
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8849
8941
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8850
8942
|
);
|
|
8851
8943
|
cb(Kcur, "Kcur", il);
|
|
@@ -8957,13 +9049,13 @@ struct llm_build_context {
|
|
|
8957
9049
|
|
|
8958
9050
|
// using mode = 2 for neox mode
|
|
8959
9051
|
Qcur = ggml_rope_ext(
|
|
8960
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
|
9052
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
8961
9053
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
8962
9054
|
);
|
|
8963
9055
|
cb(Qcur, "Qcur", il);
|
|
8964
9056
|
|
|
8965
9057
|
Kcur = ggml_rope_ext(
|
|
8966
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
|
9058
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
8967
9059
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
8968
9060
|
);
|
|
8969
9061
|
cb(Kcur, "Kcur", il);
|
|
@@ -9069,14 +9161,14 @@ struct llm_build_context {
|
|
|
9069
9161
|
|
|
9070
9162
|
Qcur = ggml_rope_ext(
|
|
9071
9163
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9072
|
-
n_rot, rope_type,
|
|
9164
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9073
9165
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9074
9166
|
);
|
|
9075
9167
|
cb(Qcur, "Qcur", il);
|
|
9076
9168
|
|
|
9077
9169
|
Kcur = ggml_rope_ext(
|
|
9078
9170
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9079
|
-
n_rot, rope_type,
|
|
9171
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9080
9172
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9081
9173
|
);
|
|
9082
9174
|
cb(Kcur, "Kcur", il);
|
|
@@ -9183,14 +9275,14 @@ struct llm_build_context {
|
|
|
9183
9275
|
|
|
9184
9276
|
Qcur = ggml_rope_ext(
|
|
9185
9277
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9186
|
-
n_rot, rope_type,
|
|
9278
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9187
9279
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9188
9280
|
);
|
|
9189
9281
|
cb(Qcur, "Qcur", il);
|
|
9190
9282
|
|
|
9191
9283
|
Kcur = ggml_rope_ext(
|
|
9192
9284
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9193
|
-
n_rot, rope_type,
|
|
9285
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9194
9286
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9195
9287
|
);
|
|
9196
9288
|
cb(Kcur, "Kcur", il);
|
|
@@ -9335,7 +9427,7 @@ struct llm_build_context {
|
|
|
9335
9427
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9336
9428
|
|
|
9337
9429
|
Qcur = ggml_rope_ext(
|
|
9338
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
|
9430
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
9339
9431
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9340
9432
|
);
|
|
9341
9433
|
cb(Qcur, "Qcur", il);
|
|
@@ -9346,7 +9438,7 @@ struct llm_build_context {
|
|
|
9346
9438
|
cb(Qcur, "Qcur", il);
|
|
9347
9439
|
|
|
9348
9440
|
Kcur = ggml_rope_ext(
|
|
9349
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
|
9441
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
9350
9442
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9351
9443
|
);
|
|
9352
9444
|
cb(Kcur, "Kcur", il);
|
|
@@ -9457,7 +9549,7 @@ struct llm_build_context {
|
|
|
9457
9549
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9458
9550
|
|
|
9459
9551
|
Qcur = ggml_rope_ext(
|
|
9460
|
-
ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type,
|
|
9552
|
+
ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
|
|
9461
9553
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9462
9554
|
);
|
|
9463
9555
|
cb(Qcur, "Qcur", il);
|
|
@@ -9466,7 +9558,7 @@ struct llm_build_context {
|
|
|
9466
9558
|
cb(Qcur, "Qcur", il);
|
|
9467
9559
|
|
|
9468
9560
|
Kcur = ggml_rope_ext(
|
|
9469
|
-
ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type,
|
|
9561
|
+
ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
|
|
9470
9562
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9471
9563
|
);
|
|
9472
9564
|
cb(Kcur, "Kcur", il);
|
|
@@ -9574,13 +9666,13 @@ struct llm_build_context {
|
|
|
9574
9666
|
|
|
9575
9667
|
Qcur = ggml_rope_ext(
|
|
9576
9668
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
|
|
9577
|
-
n_embd_head, rope_type,
|
|
9669
|
+
n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9578
9670
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9579
9671
|
cb(Qcur, "Qcur", il);
|
|
9580
9672
|
|
|
9581
9673
|
Kcur = ggml_rope_ext(
|
|
9582
9674
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9583
|
-
n_embd_head, rope_type,
|
|
9675
|
+
n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9584
9676
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9585
9677
|
cb(Kcur, "Kcur", il);
|
|
9586
9678
|
|
|
@@ -9782,14 +9874,14 @@ struct llm_build_context {
|
|
|
9782
9874
|
|
|
9783
9875
|
struct ggml_tensor * Qcur = ggml_rope_ext(
|
|
9784
9876
|
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9785
|
-
n_rot, rope_type,
|
|
9877
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9786
9878
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9787
9879
|
);
|
|
9788
9880
|
cb(Qcur, "Qcur", il);
|
|
9789
9881
|
|
|
9790
9882
|
struct ggml_tensor * Kcur = ggml_rope_ext(
|
|
9791
9883
|
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9792
|
-
n_rot, rope_type,
|
|
9884
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9793
9885
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9794
9886
|
);
|
|
9795
9887
|
cb(Kcur, "Kcur", il);
|
|
@@ -9898,14 +9990,14 @@ struct llm_build_context {
|
|
|
9898
9990
|
|
|
9899
9991
|
Qcur = ggml_rope_ext(
|
|
9900
9992
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9901
|
-
n_rot, rope_type,
|
|
9993
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9902
9994
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9903
9995
|
);
|
|
9904
9996
|
cb(Qcur, "Qcur", il);
|
|
9905
9997
|
|
|
9906
9998
|
Kcur = ggml_rope_ext(
|
|
9907
9999
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9908
|
-
n_rot, rope_type,
|
|
10000
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9909
10001
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9910
10002
|
);
|
|
9911
10003
|
cb(Kcur, "Kcur", il);
|
|
@@ -10015,14 +10107,14 @@ struct llm_build_context {
|
|
|
10015
10107
|
|
|
10016
10108
|
Qcur = ggml_rope_ext(
|
|
10017
10109
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10018
|
-
n_rot, rope_type,
|
|
10110
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10019
10111
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10020
10112
|
);
|
|
10021
10113
|
cb(Qcur, "Qcur", il);
|
|
10022
10114
|
|
|
10023
10115
|
Kcur = ggml_rope_ext(
|
|
10024
10116
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10025
|
-
n_rot, rope_type,
|
|
10117
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10026
10118
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10027
10119
|
);
|
|
10028
10120
|
cb(Kcur, "Kcur", il);
|
|
@@ -10145,14 +10237,14 @@ struct llm_build_context {
|
|
|
10145
10237
|
|
|
10146
10238
|
Qcur = ggml_rope_ext(
|
|
10147
10239
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10148
|
-
n_rot, rope_type,
|
|
10240
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10149
10241
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10150
10242
|
);
|
|
10151
10243
|
cb(Qcur, "Qcur", il);
|
|
10152
10244
|
|
|
10153
10245
|
Kcur = ggml_rope_ext(
|
|
10154
10246
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10155
|
-
n_rot, rope_type,
|
|
10247
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10156
10248
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10157
10249
|
);
|
|
10158
10250
|
cb(Kcur, "Kcur", il);
|
|
@@ -10217,7 +10309,7 @@ struct llm_build_context {
|
|
|
10217
10309
|
cb(cur, "lmhead_scaling", -1);
|
|
10218
10310
|
|
|
10219
10311
|
// lm_head
|
|
10220
|
-
cur = ggml_mul_mat(ctx0, model.
|
|
10312
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
10221
10313
|
cb(cur, "result_output", -1);
|
|
10222
10314
|
|
|
10223
10315
|
ggml_build_forward_expand(gf, cur);
|
|
@@ -10265,7 +10357,7 @@ struct llm_build_context {
|
|
|
10265
10357
|
|
|
10266
10358
|
Qcur = ggml_rope_ext(
|
|
10267
10359
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
|
|
10268
|
-
n_embd_head_k, rope_type,
|
|
10360
|
+
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10269
10361
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
10270
10362
|
cb(Qcur, "Qcur", il);
|
|
10271
10363
|
|
|
@@ -10274,7 +10366,7 @@ struct llm_build_context {
|
|
|
10274
10366
|
|
|
10275
10367
|
Kcur = ggml_rope_ext(
|
|
10276
10368
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10277
|
-
n_embd_head_k, rope_type,
|
|
10369
|
+
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10278
10370
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
10279
10371
|
cb(Kcur, "Kcur", il);
|
|
10280
10372
|
|
|
@@ -10385,14 +10477,14 @@ struct llm_build_context {
|
|
|
10385
10477
|
|
|
10386
10478
|
Qcur = ggml_rope_ext(
|
|
10387
10479
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10388
|
-
n_rot, rope_type,
|
|
10480
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10389
10481
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10390
10482
|
);
|
|
10391
10483
|
cb(Qcur, "Qcur", il);
|
|
10392
10484
|
|
|
10393
10485
|
Kcur = ggml_rope_ext(
|
|
10394
10486
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10395
|
-
n_rot, rope_type,
|
|
10487
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10396
10488
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10397
10489
|
);
|
|
10398
10490
|
cb(Kcur, "Kcur", il);
|
|
@@ -10675,14 +10767,14 @@ struct llm_build_context {
|
|
|
10675
10767
|
|
|
10676
10768
|
Qcur = ggml_rope_ext(
|
|
10677
10769
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10678
|
-
n_rot, rope_type,
|
|
10770
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10679
10771
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10680
10772
|
);
|
|
10681
10773
|
cb(Qcur, "Qcur", il);
|
|
10682
10774
|
|
|
10683
10775
|
Kcur = ggml_rope_ext(
|
|
10684
10776
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10685
|
-
n_rot, rope_type,
|
|
10777
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10686
10778
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10687
10779
|
);
|
|
10688
10780
|
cb(Kcur, "Kcur", il);
|
|
@@ -10806,14 +10898,14 @@ struct llm_build_context {
|
|
|
10806
10898
|
|
|
10807
10899
|
Qcur = ggml_rope_ext(
|
|
10808
10900
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10809
|
-
n_rot, rope_type,
|
|
10901
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10810
10902
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10811
10903
|
);
|
|
10812
10904
|
cb(Qcur, "Qcur", il);
|
|
10813
10905
|
|
|
10814
10906
|
Kcur = ggml_rope_ext(
|
|
10815
10907
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10816
|
-
n_rot, rope_type,
|
|
10908
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10817
10909
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10818
10910
|
);
|
|
10819
10911
|
cb(Kcur, "Kcur", il);
|
|
@@ -10920,14 +11012,14 @@ struct llm_build_context {
|
|
|
10920
11012
|
|
|
10921
11013
|
Qcur = ggml_rope_ext(
|
|
10922
11014
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10923
|
-
n_rot, rope_type,
|
|
11015
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10924
11016
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10925
11017
|
);
|
|
10926
11018
|
cb(Qcur, "Qcur", il);
|
|
10927
11019
|
|
|
10928
11020
|
Kcur = ggml_rope_ext(
|
|
10929
11021
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10930
|
-
n_rot, rope_type,
|
|
11022
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10931
11023
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10932
11024
|
);
|
|
10933
11025
|
cb(Kcur, "Kcur", il);
|
|
@@ -11055,14 +11147,14 @@ struct llm_build_context {
|
|
|
11055
11147
|
|
|
11056
11148
|
Qcur = ggml_rope_ext(
|
|
11057
11149
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
11058
|
-
n_rot, rope_type,
|
|
11150
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
11059
11151
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
11060
11152
|
);
|
|
11061
11153
|
cb(Qcur, "Qcur", il);
|
|
11062
11154
|
|
|
11063
11155
|
Kcur = ggml_rope_ext(
|
|
11064
11156
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
11065
|
-
n_rot, rope_type,
|
|
11157
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
11066
11158
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
11067
11159
|
);
|
|
11068
11160
|
cb(Kcur, "Kcur", il);
|
|
@@ -11272,7 +11364,7 @@ struct llm_build_context {
|
|
|
11272
11364
|
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
|
11273
11365
|
q_pe = ggml_rope_ext(
|
|
11274
11366
|
ctx0, q_pe, inp_pos, nullptr,
|
|
11275
|
-
n_rot, rope_type,
|
|
11367
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
11276
11368
|
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
|
11277
11369
|
);
|
|
11278
11370
|
cb(q_pe, "q_pe", il);
|
|
@@ -11281,7 +11373,7 @@ struct llm_build_context {
|
|
|
11281
11373
|
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
|
11282
11374
|
k_pe = ggml_rope_ext(
|
|
11283
11375
|
ctx0, k_pe, inp_pos, nullptr,
|
|
11284
|
-
n_rot, rope_type,
|
|
11376
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
11285
11377
|
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
|
11286
11378
|
);
|
|
11287
11379
|
cb(k_pe, "k_pe", il);
|
|
@@ -11458,7 +11550,8 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
11458
11550
|
if (batch.n_tokens < 32 || full_offload) {
|
|
11459
11551
|
if (il != -1 && strcmp(name, "norm") == 0) {
|
|
11460
11552
|
for (auto * backend : lctx.backends) {
|
|
11461
|
-
if (
|
|
11553
|
+
if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
|
|
11554
|
+
(ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
|
|
11462
11555
|
ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
|
|
11463
11556
|
break;
|
|
11464
11557
|
}
|
|
@@ -11955,6 +12048,11 @@ static void llama_graph_compute(
|
|
|
11955
12048
|
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
|
11956
12049
|
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
|
11957
12050
|
}
|
|
12051
|
+
#ifdef GGML_USE_BLAS
|
|
12052
|
+
if (lctx.backend_blas != nullptr) {
|
|
12053
|
+
ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
|
|
12054
|
+
}
|
|
12055
|
+
#endif
|
|
11958
12056
|
|
|
11959
12057
|
ggml_backend_sched_graph_compute_async(lctx.sched, gf);
|
|
11960
12058
|
|
|
@@ -12177,17 +12275,6 @@ static int llama_decode_internal(
|
|
|
12177
12275
|
}
|
|
12178
12276
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
|
12179
12277
|
|
|
12180
|
-
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
|
12181
|
-
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
|
12182
|
-
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
|
|
12183
|
-
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
|
|
12184
|
-
// with the BLAS calls. need a better solution
|
|
12185
|
-
// MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
|
|
12186
|
-
// being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
|
|
12187
|
-
if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
|
|
12188
|
-
n_threads = std::min(4, n_threads);
|
|
12189
|
-
}
|
|
12190
|
-
|
|
12191
12278
|
ggml_backend_sched_alloc_graph(lctx.sched, gf);
|
|
12192
12279
|
|
|
12193
12280
|
llama_set_inputs(lctx, u_batch);
|
|
@@ -12616,27 +12703,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
|
|
|
12616
12703
|
|
|
12617
12704
|
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
|
|
12618
12705
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12619
|
-
return vocab.id_to_token[id].
|
|
12706
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
|
|
12620
12707
|
}
|
|
12621
12708
|
|
|
12622
12709
|
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
|
|
12623
12710
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12624
|
-
return vocab.id_to_token[id].
|
|
12711
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
|
|
12625
12712
|
}
|
|
12626
12713
|
|
|
12627
12714
|
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
|
|
12628
12715
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12629
|
-
return vocab.id_to_token[id].
|
|
12716
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
|
|
12630
12717
|
}
|
|
12631
12718
|
|
|
12632
12719
|
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
|
|
12633
12720
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12634
|
-
return vocab.id_to_token[id].
|
|
12721
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
|
|
12635
12722
|
}
|
|
12636
12723
|
|
|
12637
12724
|
static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
|
|
12638
12725
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12639
|
-
return vocab.id_to_token[id].
|
|
12726
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
|
|
12640
12727
|
}
|
|
12641
12728
|
|
|
12642
12729
|
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
|
@@ -12954,6 +13041,11 @@ struct llm_tokenizer_bpe {
|
|
|
12954
13041
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
|
12955
13042
|
});
|
|
12956
13043
|
break;
|
|
13044
|
+
case LLAMA_VOCAB_PRE_TYPE_PORO:
|
|
13045
|
+
word_collection = unicode_regex_split(text, {
|
|
13046
|
+
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
|
13047
|
+
});
|
|
13048
|
+
break;
|
|
12957
13049
|
default:
|
|
12958
13050
|
// default regex for BPE tokenization pre-processing
|
|
12959
13051
|
word_collection = unicode_regex_split(text, {
|
|
@@ -13254,7 +13346,8 @@ struct fragment_buffer_variant {
|
|
|
13254
13346
|
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
|
13255
13347
|
// for each special token
|
|
13256
13348
|
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
|
|
13257
|
-
const auto &
|
|
13349
|
+
const auto & data = vocab.id_to_token[special_id];
|
|
13350
|
+
const auto & special_token = data.text;
|
|
13258
13351
|
|
|
13259
13352
|
// for each text fragment
|
|
13260
13353
|
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
|
|
@@ -13291,13 +13384,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
|
13291
13384
|
if (match > raw_text_base_offset) {
|
|
13292
13385
|
// left
|
|
13293
13386
|
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
|
13294
|
-
|
|
13295
|
-
|
|
13387
|
+
int64_t left_reminder_length = match - raw_text_base_offset;
|
|
13388
|
+
|
|
13389
|
+
if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
|
|
13390
|
+
while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
|
|
13391
|
+
left_reminder_length--;
|
|
13392
|
+
}
|
|
13393
|
+
}
|
|
13394
|
+
|
|
13395
|
+
if (left_reminder_length > 0) {
|
|
13396
|
+
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
|
13397
|
+
it++;
|
|
13398
|
+
}
|
|
13296
13399
|
|
|
13297
13400
|
#ifdef PRETOKENIZERDEBUG
|
|
13298
13401
|
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
|
13299
13402
|
#endif
|
|
13300
|
-
it++;
|
|
13301
13403
|
}
|
|
13302
13404
|
|
|
13303
13405
|
// special token
|
|
@@ -13306,16 +13408,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
|
13306
13408
|
|
|
13307
13409
|
// right
|
|
13308
13410
|
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
|
13309
|
-
|
|
13310
|
-
|
|
13311
|
-
|
|
13411
|
+
int64_t right_reminder_offset = match + special_token.length();
|
|
13412
|
+
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
|
13413
|
+
|
|
13414
|
+
if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
|
|
13415
|
+
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
|
|
13416
|
+
right_reminder_offset++;
|
|
13417
|
+
right_reminder_length--;
|
|
13418
|
+
}
|
|
13419
|
+
}
|
|
13420
|
+
|
|
13421
|
+
if (right_reminder_length > 0) {
|
|
13422
|
+
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
|
13423
|
+
it++;
|
|
13424
|
+
}
|
|
13312
13425
|
|
|
13313
13426
|
#ifdef PRETOKENIZERDEBUG
|
|
13314
13427
|
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
|
13315
13428
|
#endif
|
|
13316
13429
|
|
|
13317
|
-
it++;
|
|
13318
|
-
|
|
13319
13430
|
if (source == 0) {
|
|
13320
13431
|
buffer.erase_after(buffer.before_begin());
|
|
13321
13432
|
} else {
|
|
@@ -13361,9 +13472,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
|
13361
13472
|
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
|
13362
13473
|
// tokenizer.encode('', add_special_tokens=False) returns []
|
|
13363
13474
|
|
|
13364
|
-
static const bool rtrim = true; //TODO: as param
|
|
13365
13475
|
bool is_prev_special = false;
|
|
13366
|
-
bool special_token_rtrim = false;
|
|
13367
13476
|
|
|
13368
13477
|
if (add_special && vocab.special_add_bos != 0) {
|
|
13369
13478
|
GGML_ASSERT(vocab.special_bos_id != -1);
|
|
@@ -13373,25 +13482,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
|
13373
13482
|
|
|
13374
13483
|
for (const auto & fragment : fragment_buffer) {
|
|
13375
13484
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
|
13376
|
-
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
|
13377
|
-
|
|
13378
|
-
// TODO: It's likely possible to get rid of this string copy entirely
|
|
13379
|
-
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
|
|
13380
|
-
// and passing 'add space prefix' as bool argument
|
|
13381
|
-
//
|
|
13382
13485
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
|
13383
13486
|
|
|
13384
|
-
if (special_token_rtrim) {
|
|
13385
|
-
size_t num_whitespaces = 0;
|
|
13386
|
-
while (isspace(raw_text[num_whitespaces])) {
|
|
13387
|
-
num_whitespaces++;
|
|
13388
|
-
}
|
|
13389
|
-
if (num_whitespaces == raw_text.size()) {
|
|
13390
|
-
continue; // skip if all whitespaces
|
|
13391
|
-
}
|
|
13392
|
-
raw_text = raw_text.substr(num_whitespaces);
|
|
13393
|
-
}
|
|
13394
|
-
|
|
13395
13487
|
if (vocab.add_space_prefix) {
|
|
13396
13488
|
if (!output.size() || is_prev_special) { // prefix with space if first token
|
|
13397
13489
|
raw_text = " " + raw_text;
|
|
@@ -13407,11 +13499,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
|
13407
13499
|
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
|
13408
13500
|
output.push_back(fragment.token);
|
|
13409
13501
|
is_prev_special = true;
|
|
13410
|
-
// phi-3 special tokens without rtrim, works fine for llama-spm too
|
|
13411
|
-
special_token_rtrim = rtrim
|
|
13412
|
-
&& fragment.token != vocab.special_bos_id
|
|
13413
|
-
&& fragment.token != vocab.special_unk_id
|
|
13414
|
-
&& fragment.token != vocab.special_eos_id;
|
|
13415
13502
|
}
|
|
13416
13503
|
}
|
|
13417
13504
|
|
|
@@ -13574,7 +13661,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
|
|
|
13574
13661
|
const uint32_t chr) {
|
|
13575
13662
|
|
|
13576
13663
|
bool found = false;
|
|
13577
|
-
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
|
|
13664
|
+
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
|
|
13578
13665
|
|
|
13579
13666
|
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
|
|
13580
13667
|
|
|
@@ -13583,6 +13670,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
|
|
|
13583
13670
|
// inclusive range, e.g. [a-z]
|
|
13584
13671
|
found = found || (pos->value <= chr && chr <= pos[1].value);
|
|
13585
13672
|
pos += 2;
|
|
13673
|
+
} else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
|
|
13674
|
+
// Any character matches "."
|
|
13675
|
+
found = true;
|
|
13676
|
+
pos += 1;
|
|
13586
13677
|
} else {
|
|
13587
13678
|
// exact char match, e.g. [a] or "a"
|
|
13588
13679
|
found = found || pos->value == chr;
|
|
@@ -13600,7 +13691,7 @@ static bool llama_grammar_match_partial_char(
|
|
|
13600
13691
|
const llama_grammar_element * pos,
|
|
13601
13692
|
const llama_partial_utf8 partial_utf8) {
|
|
13602
13693
|
|
|
13603
|
-
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
|
|
13694
|
+
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
|
|
13604
13695
|
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
|
|
13605
13696
|
|
|
13606
13697
|
uint32_t partial_value = partial_utf8.value;
|
|
@@ -13630,6 +13721,9 @@ static bool llama_grammar_match_partial_char(
|
|
|
13630
13721
|
return is_positive_char;
|
|
13631
13722
|
}
|
|
13632
13723
|
pos += 2;
|
|
13724
|
+
} else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
|
|
13725
|
+
// Any character matches "."
|
|
13726
|
+
return true;
|
|
13633
13727
|
} else {
|
|
13634
13728
|
// exact char match, e.g. [a] or "a"
|
|
13635
13729
|
if (low <= pos->value && pos->value <= high) {
|
|
@@ -13690,6 +13784,7 @@ static void llama_grammar_advance_stack(
|
|
|
13690
13784
|
}
|
|
13691
13785
|
case LLAMA_GRETYPE_CHAR:
|
|
13692
13786
|
case LLAMA_GRETYPE_CHAR_NOT:
|
|
13787
|
+
case LLAMA_GRETYPE_CHAR_ANY:
|
|
13693
13788
|
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
|
13694
13789
|
// only add the stack if it's not a duplicate of one we already have
|
|
13695
13790
|
new_stacks.emplace_back(stack);
|
|
@@ -14646,260 +14741,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
|
14646
14741
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
14647
14742
|
}
|
|
14648
14743
|
|
|
14649
|
-
//
|
|
14650
|
-
// Beam search
|
|
14651
|
-
//
|
|
14652
|
-
|
|
14653
|
-
struct llama_beam {
|
|
14654
|
-
std::vector<llama_token> tokens;
|
|
14655
|
-
float p; // Cumulative beam probability (renormalized relative to all beams)
|
|
14656
|
-
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
|
|
14657
|
-
// Sort beams by probability. In case of ties, prefer beams at eob.
|
|
14658
|
-
bool operator<(const llama_beam & rhs) const {
|
|
14659
|
-
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
|
|
14660
|
-
}
|
|
14661
|
-
// Shift off first n tokens and discard them.
|
|
14662
|
-
void shift_tokens(const size_t n) {
|
|
14663
|
-
if (n) {
|
|
14664
|
-
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
|
|
14665
|
-
tokens.resize(tokens.size() - n);
|
|
14666
|
-
}
|
|
14667
|
-
}
|
|
14668
|
-
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
|
|
14669
|
-
};
|
|
14670
|
-
|
|
14671
|
-
// A struct for calculating logit-related info.
|
|
14672
|
-
struct llama_logit_info {
|
|
14673
|
-
const float * const logits;
|
|
14674
|
-
const int n_vocab;
|
|
14675
|
-
const float max_l;
|
|
14676
|
-
const float normalizer;
|
|
14677
|
-
struct sum_exp {
|
|
14678
|
-
float max_l;
|
|
14679
|
-
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
|
|
14680
|
-
};
|
|
14681
|
-
llama_logit_info(llama_context * ctx)
|
|
14682
|
-
: logits(llama_get_logits(ctx))
|
|
14683
|
-
, n_vocab(llama_n_vocab(llama_get_model(ctx)))
|
|
14684
|
-
, max_l(*std::max_element(logits, logits + n_vocab))
|
|
14685
|
-
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
|
|
14686
|
-
{ }
|
|
14687
|
-
llama_token_data get_token_data(const llama_token token_id) const {
|
|
14688
|
-
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
|
|
14689
|
-
return {token_id, logits[token_id], p};
|
|
14690
|
-
}
|
|
14691
|
-
// Return top k token_data by logit.
|
|
14692
|
-
std::vector<llama_token_data> top_k(size_t k) {
|
|
14693
|
-
std::vector<llama_token_data> min_heap; // min-heap by logit
|
|
14694
|
-
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
|
|
14695
|
-
min_heap.reserve(k_min);
|
|
14696
|
-
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
|
|
14697
|
-
min_heap.push_back(get_token_data(token_id));
|
|
14698
|
-
}
|
|
14699
|
-
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
|
|
14700
|
-
std::make_heap(min_heap.begin(), min_heap.end(), comp);
|
|
14701
|
-
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
|
|
14702
|
-
if (min_heap.front().logit < logits[token_id]) {
|
|
14703
|
-
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
|
|
14704
|
-
min_heap.back().id = token_id;
|
|
14705
|
-
min_heap.back().logit = logits[token_id];
|
|
14706
|
-
std::push_heap(min_heap.begin(), min_heap.end(), comp);
|
|
14707
|
-
}
|
|
14708
|
-
}
|
|
14709
|
-
return min_heap;
|
|
14710
|
-
}
|
|
14711
|
-
float probability_from_logit(float logit) const {
|
|
14712
|
-
return normalizer * std::exp(logit - max_l);
|
|
14713
|
-
}
|
|
14714
|
-
};
|
|
14715
|
-
|
|
14716
|
-
struct llama_beam_search_data {
|
|
14717
|
-
llama_context * ctx;
|
|
14718
|
-
size_t n_beams;
|
|
14719
|
-
int n_past;
|
|
14720
|
-
int n_predict;
|
|
14721
|
-
std::vector<llama_beam> beams;
|
|
14722
|
-
std::vector<llama_beam> next_beams;
|
|
14723
|
-
|
|
14724
|
-
// Re-calculated on each loop iteration
|
|
14725
|
-
size_t common_prefix_length;
|
|
14726
|
-
|
|
14727
|
-
// Used to communicate to/from callback on beams state.
|
|
14728
|
-
std::vector<llama_beam_view> beam_views;
|
|
14729
|
-
|
|
14730
|
-
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
|
|
14731
|
-
: ctx(ctx)
|
|
14732
|
-
, n_beams(n_beams)
|
|
14733
|
-
, n_past(n_past)
|
|
14734
|
-
, n_predict(n_predict)
|
|
14735
|
-
, beam_views(n_beams) {
|
|
14736
|
-
beams.reserve(n_beams);
|
|
14737
|
-
next_beams.reserve(n_beams);
|
|
14738
|
-
}
|
|
14739
|
-
|
|
14740
|
-
// Collapse beams to a single beam given by index.
|
|
14741
|
-
void collapse_beams(const size_t beam_idx) {
|
|
14742
|
-
if (0u < beam_idx) {
|
|
14743
|
-
std::swap(beams[0], beams[beam_idx]);
|
|
14744
|
-
}
|
|
14745
|
-
beams.resize(1);
|
|
14746
|
-
}
|
|
14747
|
-
|
|
14748
|
-
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
|
14749
|
-
// The repetitive patterns below reflect the 2 stages of heaps:
|
|
14750
|
-
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
|
14751
|
-
// * If the heap is full and a new element is found that should be included, pop the
|
|
14752
|
-
// least element to the back(), replace it with the new, then push it into the heap.
|
|
14753
|
-
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
|
|
14754
|
-
// Min-heaps use a greater-than comparator.
|
|
14755
|
-
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
|
|
14756
|
-
if (beam.eob) {
|
|
14757
|
-
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
|
|
14758
|
-
if (next_beams.size() < n_beams) {
|
|
14759
|
-
next_beams.push_back(std::move(beam));
|
|
14760
|
-
if (next_beams.size() == n_beams) {
|
|
14761
|
-
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14762
|
-
}
|
|
14763
|
-
} else if (next_beams.front().p < beam.p) {
|
|
14764
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14765
|
-
next_beams.back() = std::move(beam);
|
|
14766
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14767
|
-
}
|
|
14768
|
-
} else {
|
|
14769
|
-
// beam is not at end-of-sentence, so branch with next top_k tokens.
|
|
14770
|
-
if (!beam.tokens.empty()) {
|
|
14771
|
-
llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
|
|
14772
|
-
}
|
|
14773
|
-
llama_logit_info logit_info(ctx);
|
|
14774
|
-
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
|
14775
|
-
|
|
14776
|
-
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
|
|
14777
|
-
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
|
|
14778
|
-
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
|
14779
|
-
|
|
14780
|
-
size_t i=0;
|
|
14781
|
-
if (next_beams.size() < n_beams) {
|
|
14782
|
-
for (; next_beams.size() < n_beams ; ++i) {
|
|
14783
|
-
llama_beam next_beam = beam;
|
|
14784
|
-
next_beam.tokens.push_back(next_tokens[i].id);
|
|
14785
|
-
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
|
14786
|
-
next_beams.push_back(std::move(next_beam));
|
|
14787
|
-
}
|
|
14788
|
-
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14789
|
-
} else {
|
|
14790
|
-
for (; next_beams.front().p == 0.0f ; ++i) {
|
|
14791
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14792
|
-
next_beams.back() = beam;
|
|
14793
|
-
next_beams.back().tokens.push_back(next_tokens[i].id);
|
|
14794
|
-
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
|
14795
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14796
|
-
}
|
|
14797
|
-
}
|
|
14798
|
-
for (; i < n_beams ; ++i) {
|
|
14799
|
-
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
|
|
14800
|
-
if (next_beams.front().p < next_p) {
|
|
14801
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14802
|
-
next_beams.back() = beam;
|
|
14803
|
-
next_beams.back().tokens.push_back(next_tokens[i].id);
|
|
14804
|
-
next_beams.back().p = next_p;
|
|
14805
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14806
|
-
}
|
|
14807
|
-
}
|
|
14808
|
-
}
|
|
14809
|
-
}
|
|
14810
|
-
|
|
14811
|
-
// Find common_prefix_length based on beams.
|
|
14812
|
-
// Requires beams is not empty.
|
|
14813
|
-
size_t find_common_prefix_length() {
|
|
14814
|
-
size_t common_prefix_length = beams[0].tokens.size();
|
|
14815
|
-
for (size_t i = 1 ; i < beams.size() ; ++i) {
|
|
14816
|
-
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
|
|
14817
|
-
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
|
|
14818
|
-
if (beams[0].tokens[j] != beams[i].tokens[j]) {
|
|
14819
|
-
common_prefix_length = j;
|
|
14820
|
-
break;
|
|
14821
|
-
}
|
|
14822
|
-
}
|
|
14823
|
-
}
|
|
14824
|
-
return common_prefix_length;
|
|
14825
|
-
}
|
|
14826
|
-
|
|
14827
|
-
// Construct beams_state to send back to caller via the callback function.
|
|
14828
|
-
// Side effect: set common_prefix_length = find_common_prefix_length();
|
|
14829
|
-
llama_beams_state get_beams_state(const bool last_call) {
|
|
14830
|
-
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
|
14831
|
-
beam_views[i] = beams[i].view();
|
|
14832
|
-
}
|
|
14833
|
-
common_prefix_length = find_common_prefix_length();
|
|
14834
|
-
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
|
|
14835
|
-
}
|
|
14836
|
-
|
|
14837
|
-
// Loop:
|
|
14838
|
-
// * while i < n_predict, AND
|
|
14839
|
-
// * any of the beams have not yet reached end-of-beam (eob), AND
|
|
14840
|
-
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
|
|
14841
|
-
// (since all other beam probabilities can only decrease)
|
|
14842
|
-
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
|
|
14843
|
-
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
|
|
14844
|
-
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
|
|
14845
|
-
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
|
|
14846
|
-
!beams[top_beam_index()].eob ; ++i) {
|
|
14847
|
-
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
|
|
14848
|
-
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
|
|
14849
|
-
if (common_prefix_length) {
|
|
14850
|
-
llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
|
|
14851
|
-
n_past += common_prefix_length;
|
|
14852
|
-
}
|
|
14853
|
-
// Zero-out next_beam probabilities to place them last in following min-heap.
|
|
14854
|
-
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
|
|
14855
|
-
for (llama_beam & beam : beams) {
|
|
14856
|
-
beam.shift_tokens(common_prefix_length);
|
|
14857
|
-
fill_next_beams_by_top_probabilities(beam);
|
|
14858
|
-
}
|
|
14859
|
-
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
|
|
14860
|
-
beams.swap(next_beams);
|
|
14861
|
-
renormalize_beam_probabilities(beams);
|
|
14862
|
-
}
|
|
14863
|
-
collapse_beams(top_beam_index());
|
|
14864
|
-
callback(callback_data, get_beams_state(true));
|
|
14865
|
-
}
|
|
14866
|
-
|
|
14867
|
-
// As beams grow, the cumulative probabilities decrease.
|
|
14868
|
-
// Renormalize them to avoid floating point underflow.
|
|
14869
|
-
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
|
|
14870
|
-
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
|
|
14871
|
-
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
|
|
14872
|
-
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
|
|
14873
|
-
}
|
|
14874
|
-
|
|
14875
|
-
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
|
|
14876
|
-
size_t top_beam_index() {
|
|
14877
|
-
return std::max_element(beams.begin(), beams.end()) - beams.begin();
|
|
14878
|
-
}
|
|
14879
|
-
|
|
14880
|
-
// Copy (p,eob) for each beam which may have been changed by the callback.
|
|
14881
|
-
void update_beams_from_beam_views() {
|
|
14882
|
-
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
|
14883
|
-
beams[i].p = beam_views[i].p;
|
|
14884
|
-
beams[i].eob = beam_views[i].eob;
|
|
14885
|
-
}
|
|
14886
|
-
}
|
|
14887
|
-
};
|
|
14888
|
-
|
|
14889
|
-
void llama_beam_search(llama_context * ctx,
|
|
14890
|
-
llama_beam_search_callback_fn_t callback, void * callback_data,
|
|
14891
|
-
size_t n_beams, int n_past, int n_predict) {
|
|
14892
|
-
assert(ctx);
|
|
14893
|
-
const int64_t t_start_sample_us = ggml_time_us();
|
|
14894
|
-
|
|
14895
|
-
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
|
|
14896
|
-
|
|
14897
|
-
beam_search_data.loop(callback, callback_data);
|
|
14898
|
-
|
|
14899
|
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
14900
|
-
ctx->n_sample++;
|
|
14901
|
-
}
|
|
14902
|
-
|
|
14903
14744
|
//
|
|
14904
14745
|
// quantization
|
|
14905
14746
|
//
|
|
@@ -15417,6 +15258,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
15417
15258
|
if (imatrix_data) {
|
|
15418
15259
|
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
|
|
15419
15260
|
qs.has_imatrix = true;
|
|
15261
|
+
// check imatrix for nans or infs
|
|
15262
|
+
for (const auto & kv : *imatrix_data) {
|
|
15263
|
+
for (float f : kv.second) {
|
|
15264
|
+
if (!std::isfinite(f)) {
|
|
15265
|
+
throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
|
|
15266
|
+
}
|
|
15267
|
+
}
|
|
15268
|
+
}
|
|
15420
15269
|
}
|
|
15421
15270
|
}
|
|
15422
15271
|
|
|
@@ -16110,7 +15959,7 @@ bool llama_supports_mlock(void) {
|
|
|
16110
15959
|
}
|
|
16111
15960
|
|
|
16112
15961
|
bool llama_supports_gpu_offload(void) {
|
|
16113
|
-
#if defined(GGML_USE_CUDA) || defined(
|
|
15962
|
+
#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
|
16114
15963
|
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
|
|
16115
15964
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
|
16116
15965
|
return true;
|
|
@@ -16167,7 +16016,7 @@ struct llama_model * llama_load_model_from_file(
|
|
|
16167
16016
|
return true;
|
|
16168
16017
|
};
|
|
16169
16018
|
}
|
|
16170
|
-
if (params.rpc_servers != nullptr) {
|
|
16019
|
+
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
|
|
16171
16020
|
// split the servers set them into model->rpc_servers
|
|
16172
16021
|
std::string servers(params.rpc_servers);
|
|
16173
16022
|
size_t pos = 0;
|
|
@@ -16221,6 +16070,11 @@ struct llama_context * llama_new_context_with_model(
|
|
|
16221
16070
|
params.flash_attn = false;
|
|
16222
16071
|
}
|
|
16223
16072
|
|
|
16073
|
+
if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
|
|
16074
|
+
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
|
|
16075
|
+
return nullptr;
|
|
16076
|
+
}
|
|
16077
|
+
|
|
16224
16078
|
llama_context * ctx = new llama_context(*model);
|
|
16225
16079
|
|
|
16226
16080
|
const auto & hparams = model->hparams;
|
|
@@ -16259,8 +16113,8 @@ struct llama_context * llama_new_context_with_model(
|
|
|
16259
16113
|
|
|
16260
16114
|
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
|
16261
16115
|
|
|
16262
|
-
cparams.
|
|
16263
|
-
hparams.
|
|
16116
|
+
cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
|
|
16117
|
+
hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
|
|
16264
16118
|
hparams.n_ctx_train;
|
|
16265
16119
|
|
|
16266
16120
|
cparams.cb_eval = params.cb_eval;
|
|
@@ -16325,17 +16179,7 @@ struct llama_context * llama_new_context_with_model(
|
|
|
16325
16179
|
|
|
16326
16180
|
if (!hparams.vocab_only) {
|
|
16327
16181
|
// initialize backends
|
|
16328
|
-
#if defined(
|
|
16329
|
-
for (auto & server : model->rpc_servers) {
|
|
16330
|
-
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
|
16331
|
-
if (backend == nullptr) {
|
|
16332
|
-
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
|
16333
|
-
llama_free(ctx);
|
|
16334
|
-
return nullptr;
|
|
16335
|
-
}
|
|
16336
|
-
ctx->backends.push_back(backend);
|
|
16337
|
-
}
|
|
16338
|
-
#elif defined(GGML_USE_METAL)
|
|
16182
|
+
#if defined(GGML_USE_METAL)
|
|
16339
16183
|
if (model->n_gpu_layers > 0) {
|
|
16340
16184
|
ctx->backend_metal = ggml_backend_metal_init();
|
|
16341
16185
|
if (ctx->backend_metal == nullptr) {
|
|
@@ -16374,7 +16218,7 @@ struct llama_context * llama_new_context_with_model(
|
|
|
16374
16218
|
return nullptr;
|
|
16375
16219
|
}
|
|
16376
16220
|
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
|
16377
|
-
ggml_backend_t backend = ggml_backend_vk_init(
|
|
16221
|
+
ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
|
|
16378
16222
|
if (backend == nullptr) {
|
|
16379
16223
|
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
|
16380
16224
|
llama_free(ctx);
|
|
@@ -16428,6 +16272,29 @@ struct llama_context * llama_new_context_with_model(
|
|
|
16428
16272
|
ctx->backends.push_back(backend);
|
|
16429
16273
|
}
|
|
16430
16274
|
#endif
|
|
16275
|
+
|
|
16276
|
+
#ifdef GGML_USE_BLAS
|
|
16277
|
+
ctx->backend_blas = ggml_backend_blas_init();
|
|
16278
|
+
if (ctx->backend_blas == nullptr) {
|
|
16279
|
+
LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
|
|
16280
|
+
} else {
|
|
16281
|
+
ctx->backends.push_back(ctx->backend_blas);
|
|
16282
|
+
}
|
|
16283
|
+
#endif
|
|
16284
|
+
|
|
16285
|
+
#if defined(GGML_USE_RPC)
|
|
16286
|
+
if (model->n_gpu_layers > 0) {
|
|
16287
|
+
for (const auto & endpoint : model->rpc_servers) {
|
|
16288
|
+
ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
|
|
16289
|
+
if (backend == nullptr) {
|
|
16290
|
+
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
|
|
16291
|
+
llama_free(ctx);
|
|
16292
|
+
return nullptr;
|
|
16293
|
+
}
|
|
16294
|
+
ctx->backends.push_back(backend);
|
|
16295
|
+
}
|
|
16296
|
+
}
|
|
16297
|
+
#endif
|
|
16431
16298
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
|
16432
16299
|
if (ctx->backend_cpu == nullptr) {
|
|
16433
16300
|
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
|
@@ -18209,9 +18076,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
|
|
|
18209
18076
|
return model->vocab.id_to_token[token].score;
|
|
18210
18077
|
}
|
|
18211
18078
|
|
|
18212
|
-
|
|
18079
|
+
llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
|
|
18213
18080
|
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
18214
|
-
return model->vocab.id_to_token[token].
|
|
18081
|
+
return model->vocab.id_to_token[token].attr;
|
|
18215
18082
|
}
|
|
18216
18083
|
|
|
18217
18084
|
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
|
@@ -18313,9 +18180,14 @@ static std::string llama_decode_text(const std::string & text) {
|
|
|
18313
18180
|
|
|
18314
18181
|
// does not write null-terminator to buf
|
|
18315
18182
|
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
|
|
18183
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
|
|
18184
|
+
if (!special && llama_is_control_token(model->vocab, token)) {
|
|
18185
|
+
return 0;
|
|
18186
|
+
}
|
|
18187
|
+
|
|
18316
18188
|
// if we have a cache - use it
|
|
18317
18189
|
{
|
|
18318
|
-
const auto & cache =
|
|
18190
|
+
const auto & cache = model->vocab.cache_token_to_piece;
|
|
18319
18191
|
|
|
18320
18192
|
if (!cache.empty()) {
|
|
18321
18193
|
const auto & res = cache.at(token);
|