llama_cpp 0.15.4 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/llama_cpp.cpp +15 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +13 -1
- data/vendor/tmp/llama.cpp/Makefile +62 -35
- data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
- data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
- data/vendor/tmp/llama.cpp/ggml.c +178 -330
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +242 -426
- data/vendor/tmp/llama.cpp/llama.h +17 -43
- metadata +121 -6
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
|
@@ -13,8 +13,6 @@
|
|
|
13
13
|
|
|
14
14
|
#ifdef GGML_USE_CUDA
|
|
15
15
|
# include "ggml-cuda.h"
|
|
16
|
-
#elif defined(GGML_USE_CLBLAST)
|
|
17
|
-
# include "ggml-opencl.h"
|
|
18
16
|
#elif defined(GGML_USE_VULKAN)
|
|
19
17
|
# include "ggml-vulkan.h"
|
|
20
18
|
#elif defined(GGML_USE_SYCL)
|
|
@@ -110,7 +108,7 @@
|
|
|
110
108
|
//
|
|
111
109
|
|
|
112
110
|
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
|
113
|
-
static void llama_log_internal (ggml_log_level level, const char* format, ...);
|
|
111
|
+
static void llama_log_internal (ggml_log_level level, const char * format, ...);
|
|
114
112
|
static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
|
115
113
|
|
|
116
114
|
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
|
@@ -1850,7 +1848,7 @@ struct llama_hparams {
|
|
|
1850
1848
|
float rope_attn_factor = 1.0f;
|
|
1851
1849
|
float rope_freq_base_train;
|
|
1852
1850
|
float rope_freq_scale_train;
|
|
1853
|
-
uint32_t
|
|
1851
|
+
uint32_t n_ctx_orig_yarn;
|
|
1854
1852
|
float rope_yarn_log_mul;
|
|
1855
1853
|
|
|
1856
1854
|
// for State Space Models
|
|
@@ -1892,7 +1890,7 @@ struct llama_hparams {
|
|
|
1892
1890
|
if (this->n_expert_shared != other.n_expert_shared) return true;
|
|
1893
1891
|
|
|
1894
1892
|
if (this->rope_finetuned != other.rope_finetuned) return true;
|
|
1895
|
-
if (this->
|
|
1893
|
+
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
|
|
1896
1894
|
|
|
1897
1895
|
if (this->ssm_d_conv != other.ssm_d_conv) return true;
|
|
1898
1896
|
if (this->ssm_d_inner != other.ssm_d_inner) return true;
|
|
@@ -1951,7 +1949,7 @@ struct llama_cparams {
|
|
|
1951
1949
|
float rope_freq_base;
|
|
1952
1950
|
float rope_freq_scale;
|
|
1953
1951
|
|
|
1954
|
-
uint32_t
|
|
1952
|
+
uint32_t n_ctx_orig_yarn;
|
|
1955
1953
|
// These hyperparameters are not exposed in GGUF, because all
|
|
1956
1954
|
// existing YaRN models use the same values for them.
|
|
1957
1955
|
float yarn_ext_factor;
|
|
@@ -2149,12 +2147,12 @@ struct llama_control_vector {
|
|
|
2149
2147
|
struct llama_vocab {
|
|
2150
2148
|
using id = int32_t;
|
|
2151
2149
|
using token = std::string;
|
|
2152
|
-
using
|
|
2150
|
+
using tattr = llama_token_attr;
|
|
2153
2151
|
|
|
2154
2152
|
struct token_data {
|
|
2155
2153
|
token text;
|
|
2156
2154
|
float score;
|
|
2157
|
-
|
|
2155
|
+
tattr attr;
|
|
2158
2156
|
};
|
|
2159
2157
|
|
|
2160
2158
|
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
|
@@ -2164,8 +2162,7 @@ struct llama_vocab {
|
|
|
2164
2162
|
std::vector<token_data> id_to_token;
|
|
2165
2163
|
|
|
2166
2164
|
std::vector<id> cache_special_tokens;
|
|
2167
|
-
std::vector<token> cache_token_to_piece;
|
|
2168
|
-
std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
|
|
2165
|
+
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
|
|
2169
2166
|
|
|
2170
2167
|
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
|
2171
2168
|
|
|
@@ -2372,13 +2369,34 @@ struct llama_context {
|
|
|
2372
2369
|
struct llama_control_vector cvec;
|
|
2373
2370
|
};
|
|
2374
2371
|
|
|
2372
|
+
static size_t llama_get_device_count(const llama_model & model) {
|
|
2373
|
+
size_t count = 1;
|
|
2374
|
+
#if defined(GGML_USE_CUDA)
|
|
2375
|
+
count = ggml_backend_cuda_get_device_count();
|
|
2376
|
+
#elif defined(GGML_USE_SYCL)
|
|
2377
|
+
count = ggml_backend_sycl_get_device_count();
|
|
2378
|
+
#elif defined(GGML_USE_VULKAN)
|
|
2379
|
+
count = ggml_backend_vk_get_device_count();
|
|
2380
|
+
#endif
|
|
2381
|
+
#if defined(GGML_USE_RPC)
|
|
2382
|
+
count += model.rpc_servers.size();
|
|
2383
|
+
#endif
|
|
2384
|
+
return count;
|
|
2385
|
+
GGML_UNUSED(model);
|
|
2386
|
+
}
|
|
2387
|
+
|
|
2375
2388
|
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
|
2376
2389
|
ggml_backend_buffer_type_t buft = nullptr;
|
|
2377
2390
|
|
|
2378
|
-
#
|
|
2379
|
-
|
|
2380
|
-
|
|
2381
|
-
|
|
2391
|
+
#if defined(GGML_USE_RPC)
|
|
2392
|
+
int dev_count = (int)llama_get_device_count(model);
|
|
2393
|
+
int rpc_count = (int)model.rpc_servers.size();
|
|
2394
|
+
if (gpu >= dev_count - rpc_count) {
|
|
2395
|
+
const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
|
|
2396
|
+
return ggml_backend_rpc_buffer_type(endpoint);
|
|
2397
|
+
}
|
|
2398
|
+
#endif
|
|
2399
|
+
#if defined(GGML_USE_METAL)
|
|
2382
2400
|
buft = ggml_backend_metal_buffer_type();
|
|
2383
2401
|
#elif defined(GGML_USE_CUDA)
|
|
2384
2402
|
buft = ggml_backend_cuda_buffer_type(gpu);
|
|
@@ -2386,8 +2404,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
|
|
|
2386
2404
|
buft = ggml_backend_vk_buffer_type(gpu);
|
|
2387
2405
|
#elif defined(GGML_USE_SYCL)
|
|
2388
2406
|
buft = ggml_backend_sycl_buffer_type(gpu);
|
|
2389
|
-
#elif defined(GGML_USE_CLBLAST)
|
|
2390
|
-
buft = ggml_backend_opencl_buffer_type();
|
|
2391
2407
|
#elif defined(GGML_USE_KOMPUTE)
|
|
2392
2408
|
buft = ggml_backend_kompute_buffer_type(gpu);
|
|
2393
2409
|
if (buft == nullptr) {
|
|
@@ -2426,29 +2442,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
|
|
|
2426
2442
|
GGML_UNUSED(tensor_split);
|
|
2427
2443
|
}
|
|
2428
2444
|
|
|
2429
|
-
static size_t llama_get_device_count(const llama_model & model) {
|
|
2430
|
-
#if defined(GGML_USE_RPC)
|
|
2431
|
-
return model.rpc_servers.size();
|
|
2432
|
-
#elif defined(GGML_USE_CUDA)
|
|
2433
|
-
return ggml_backend_cuda_get_device_count();
|
|
2434
|
-
#elif defined(GGML_USE_SYCL)
|
|
2435
|
-
return ggml_backend_sycl_get_device_count();
|
|
2436
|
-
#elif defined(GGML_USE_VULKAN)
|
|
2437
|
-
return ggml_backend_vk_get_device_count();
|
|
2438
|
-
#else
|
|
2439
|
-
return 1;
|
|
2440
|
-
#endif
|
|
2441
|
-
GGML_UNUSED(model);
|
|
2442
|
-
}
|
|
2443
|
-
|
|
2444
2445
|
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
|
2445
2446
|
#if defined(GGML_USE_RPC)
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2447
|
+
int dev_count = (int)llama_get_device_count(model);
|
|
2448
|
+
int rpc_count = (int)model.rpc_servers.size();
|
|
2449
|
+
if (device >= dev_count - rpc_count) {
|
|
2450
|
+
size_t total;
|
|
2451
|
+
size_t free;
|
|
2452
|
+
const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
|
|
2453
|
+
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
|
2454
|
+
return free;
|
|
2455
|
+
}
|
|
2456
|
+
#endif
|
|
2457
|
+
#if defined(GGML_USE_CUDA)
|
|
2452
2458
|
size_t total;
|
|
2453
2459
|
size_t free;
|
|
2454
2460
|
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
|
@@ -2520,10 +2526,6 @@ static bool llama_kv_cache_init(
|
|
|
2520
2526
|
}
|
|
2521
2527
|
}
|
|
2522
2528
|
|
|
2523
|
-
#ifdef GGML_USE_CLBLAST
|
|
2524
|
-
offload = false;
|
|
2525
|
-
#endif
|
|
2526
|
-
|
|
2527
2529
|
// count used buffer types
|
|
2528
2530
|
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
|
2529
2531
|
if (offload) {
|
|
@@ -4003,8 +4005,8 @@ static void llm_load_hparams(
|
|
|
4003
4005
|
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
|
4004
4006
|
hparams.rope_finetuned = rope_finetuned;
|
|
4005
4007
|
|
|
4006
|
-
hparams.
|
|
4007
|
-
ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.
|
|
4008
|
+
hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
|
|
4009
|
+
ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
|
|
4008
4010
|
|
|
4009
4011
|
// rope_freq_base (optional)
|
|
4010
4012
|
hparams.rope_freq_base_train = 10000.0f;
|
|
@@ -4740,7 +4742,20 @@ static void llm_load_vocab(
|
|
|
4740
4742
|
auto & token_data = vocab.id_to_token[i];
|
|
4741
4743
|
token_data.text = std::move(word);
|
|
4742
4744
|
token_data.score = scores ? scores[i] : 0.0f;
|
|
4743
|
-
token_data.
|
|
4745
|
+
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
|
|
4746
|
+
|
|
4747
|
+
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
|
|
4748
|
+
switch(toktypes[i]) {
|
|
4749
|
+
case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
|
|
4750
|
+
case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
|
|
4751
|
+
case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
|
|
4752
|
+
case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
|
|
4753
|
+
case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
|
|
4754
|
+
case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
|
|
4755
|
+
case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
|
4756
|
+
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
|
4757
|
+
}
|
|
4758
|
+
}
|
|
4744
4759
|
}
|
|
4745
4760
|
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
|
|
4746
4761
|
|
|
@@ -4831,7 +4846,7 @@ static void llm_load_vocab(
|
|
|
4831
4846
|
// build special tokens cache
|
|
4832
4847
|
{
|
|
4833
4848
|
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
|
|
4834
|
-
if (vocab.id_to_token[id].
|
|
4849
|
+
if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
|
|
4835
4850
|
vocab.cache_special_tokens.push_back(id);
|
|
4836
4851
|
}
|
|
4837
4852
|
}
|
|
@@ -4845,26 +4860,75 @@ static void llm_load_vocab(
|
|
|
4845
4860
|
LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
|
|
4846
4861
|
}
|
|
4847
4862
|
|
|
4848
|
-
// build token to piece
|
|
4863
|
+
// build token to piece cache
|
|
4849
4864
|
{
|
|
4850
4865
|
size_t size_cache = 0;
|
|
4851
4866
|
|
|
4852
|
-
std::vector<llama_vocab::token> cache_token_to_piece
|
|
4853
|
-
std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
|
|
4867
|
+
std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
|
|
4854
4868
|
|
|
4855
4869
|
for (uint32_t id = 0; id < n_vocab; ++id) {
|
|
4856
|
-
cache_token_to_piece[id]
|
|
4857
|
-
cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
|
|
4870
|
+
cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
|
|
4858
4871
|
|
|
4859
4872
|
size_cache += cache_token_to_piece[id].size();
|
|
4860
|
-
size_cache += cache_token_to_piece_special[id].size();
|
|
4861
4873
|
}
|
|
4862
4874
|
|
|
4863
|
-
std::swap(vocab.cache_token_to_piece,
|
|
4864
|
-
std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
|
|
4875
|
+
std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
|
|
4865
4876
|
|
|
4866
4877
|
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
|
|
4867
4878
|
}
|
|
4879
|
+
|
|
4880
|
+
// Handle per token attributes
|
|
4881
|
+
//NOTE: Each model customizes per token attributes.
|
|
4882
|
+
//NOTE: Per token attributes are missing from the GGUF file.
|
|
4883
|
+
//TODO: Extract attributes from GGUF file.
|
|
4884
|
+
{
|
|
4885
|
+
auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
|
|
4886
|
+
for (auto substr : substrs) {
|
|
4887
|
+
if (str.find(substr) < std::string::npos) {
|
|
4888
|
+
return true;
|
|
4889
|
+
}
|
|
4890
|
+
}
|
|
4891
|
+
return false;
|
|
4892
|
+
};
|
|
4893
|
+
|
|
4894
|
+
auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
|
|
4895
|
+
uint32_t current = vocab.id_to_token.at(id).attr;
|
|
4896
|
+
current = value ? (current | attr) : (current & ~attr);
|
|
4897
|
+
vocab.id_to_token[id].attr = (llama_token_attr) current;
|
|
4898
|
+
};
|
|
4899
|
+
|
|
4900
|
+
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
|
|
4901
|
+
_set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
|
|
4902
|
+
};
|
|
4903
|
+
|
|
4904
|
+
std::string model_name;
|
|
4905
|
+
std::string tokenizer_pre;
|
|
4906
|
+
|
|
4907
|
+
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
|
|
4908
|
+
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
|
4909
|
+
|
|
4910
|
+
// model name to lowercase
|
|
4911
|
+
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
|
|
4912
|
+
[] (const std::string::value_type x) {
|
|
4913
|
+
return std::tolower(x);
|
|
4914
|
+
}
|
|
4915
|
+
);
|
|
4916
|
+
|
|
4917
|
+
// set attributes by model/tokenizer name
|
|
4918
|
+
if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
|
|
4919
|
+
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
|
|
4920
|
+
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
|
|
4921
|
+
for (auto id : vocab.cache_special_tokens) {
|
|
4922
|
+
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
|
4923
|
+
}
|
|
4924
|
+
for (auto token : {"</s>"}) {
|
|
4925
|
+
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
|
4926
|
+
}
|
|
4927
|
+
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
|
|
4928
|
+
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
|
|
4929
|
+
}
|
|
4930
|
+
}
|
|
4931
|
+
}
|
|
4868
4932
|
}
|
|
4869
4933
|
|
|
4870
4934
|
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
@@ -4904,7 +4968,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
|
4904
4968
|
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
|
4905
4969
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
|
4906
4970
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
|
4907
|
-
LLAMA_LOG_INFO("%s:
|
|
4971
|
+
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
|
4908
4972
|
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
|
|
4909
4973
|
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
|
4910
4974
|
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
|
@@ -5129,12 +5193,10 @@ static bool llm_load_tensors(
|
|
|
5129
5193
|
// output
|
|
5130
5194
|
{
|
|
5131
5195
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
5132
|
-
|
|
5133
|
-
|
|
5134
|
-
|
|
5135
|
-
|
|
5136
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
|
5137
|
-
}
|
|
5196
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5197
|
+
// if output is NULL, init from the input tok embed
|
|
5198
|
+
if (model.output == NULL) {
|
|
5199
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
|
5138
5200
|
}
|
|
5139
5201
|
}
|
|
5140
5202
|
|
|
@@ -7072,7 +7134,7 @@ struct llm_build_context {
|
|
|
7072
7134
|
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
|
|
7073
7135
|
const int32_t n_outputs;
|
|
7074
7136
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
|
7075
|
-
const int32_t
|
|
7137
|
+
const int32_t n_ctx_orig;
|
|
7076
7138
|
|
|
7077
7139
|
const bool flash_attn;
|
|
7078
7140
|
|
|
@@ -7121,7 +7183,7 @@ struct llm_build_context {
|
|
|
7121
7183
|
n_kv (worst_case ? kv_self.size : kv_self.n),
|
|
7122
7184
|
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
|
|
7123
7185
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
|
7124
|
-
|
|
7186
|
+
n_ctx_orig (cparams.n_ctx_orig_yarn),
|
|
7125
7187
|
flash_attn (cparams.flash_attn),
|
|
7126
7188
|
pooling_type (cparams.pooling_type),
|
|
7127
7189
|
rope_type (hparams.rope_type),
|
|
@@ -7179,7 +7241,7 @@ struct llm_build_context {
|
|
|
7179
7241
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
|
|
7180
7242
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
|
7181
7243
|
0),
|
|
7182
|
-
lctx.inp_K_shift, rope_factors, n_rot, rope_type,
|
|
7244
|
+
lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7183
7245
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
7184
7246
|
|
|
7185
7247
|
cb(tmp, "K_shifted", il);
|
|
@@ -7288,7 +7350,7 @@ struct llm_build_context {
|
|
|
7288
7350
|
// choose long/short freq factors based on the context size
|
|
7289
7351
|
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
|
|
7290
7352
|
|
|
7291
|
-
if (n_ctx_pre_seq > hparams.
|
|
7353
|
+
if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
|
|
7292
7354
|
return model.layers[il].rope_long;
|
|
7293
7355
|
}
|
|
7294
7356
|
|
|
@@ -7404,14 +7466,14 @@ struct llm_build_context {
|
|
|
7404
7466
|
|
|
7405
7467
|
Qcur = ggml_rope_ext(
|
|
7406
7468
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7407
|
-
n_rot, rope_type,
|
|
7469
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7408
7470
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7409
7471
|
);
|
|
7410
7472
|
cb(Qcur, "Qcur", il);
|
|
7411
7473
|
|
|
7412
7474
|
Kcur = ggml_rope_ext(
|
|
7413
7475
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7414
|
-
n_rot, rope_type,
|
|
7476
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7415
7477
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7416
7478
|
);
|
|
7417
7479
|
cb(Kcur, "Kcur", il);
|
|
@@ -7535,12 +7597,12 @@ struct llm_build_context {
|
|
|
7535
7597
|
case MODEL_7B:
|
|
7536
7598
|
Qcur = ggml_rope_ext(
|
|
7537
7599
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7538
|
-
n_rot, rope_type,
|
|
7600
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7539
7601
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7540
7602
|
);
|
|
7541
7603
|
Kcur = ggml_rope_ext(
|
|
7542
7604
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7543
|
-
n_rot, rope_type,
|
|
7605
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7544
7606
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7545
7607
|
);
|
|
7546
7608
|
break;
|
|
@@ -7647,14 +7709,14 @@ struct llm_build_context {
|
|
|
7647
7709
|
|
|
7648
7710
|
Qcur = ggml_rope_ext(
|
|
7649
7711
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7650
|
-
n_rot, rope_type,
|
|
7712
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7651
7713
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7652
7714
|
);
|
|
7653
7715
|
cb(Qcur, "Qcur", il);
|
|
7654
7716
|
|
|
7655
7717
|
Kcur = ggml_rope_ext(
|
|
7656
7718
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7657
|
-
n_rot, rope_type,
|
|
7719
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7658
7720
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7659
7721
|
);
|
|
7660
7722
|
cb(Kcur, "Kcur", il);
|
|
@@ -7767,13 +7829,13 @@ struct llm_build_context {
|
|
|
7767
7829
|
|
|
7768
7830
|
// using mode = 2 for neox mode
|
|
7769
7831
|
Qcur = ggml_rope_ext(
|
|
7770
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
|
7832
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
7771
7833
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
7772
7834
|
);
|
|
7773
7835
|
cb(Qcur, "Qcur", il);
|
|
7774
7836
|
|
|
7775
7837
|
Kcur = ggml_rope_ext(
|
|
7776
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
|
7838
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
7777
7839
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
7778
7840
|
);
|
|
7779
7841
|
cb(Kcur, "Kcur", il);
|
|
@@ -7891,14 +7953,14 @@ struct llm_build_context {
|
|
|
7891
7953
|
|
|
7892
7954
|
Qcur = ggml_rope_ext(
|
|
7893
7955
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7894
|
-
n_rot, rope_type,
|
|
7956
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7895
7957
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7896
7958
|
);
|
|
7897
7959
|
cb(Qcur, "Qcur", il);
|
|
7898
7960
|
|
|
7899
7961
|
Kcur = ggml_rope_ext(
|
|
7900
7962
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7901
|
-
n_rot, rope_type,
|
|
7963
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7902
7964
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7903
7965
|
);
|
|
7904
7966
|
cb(Kcur, "Kcur", il);
|
|
@@ -8044,14 +8106,14 @@ struct llm_build_context {
|
|
|
8044
8106
|
|
|
8045
8107
|
Qcur = ggml_rope_ext(
|
|
8046
8108
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
8047
|
-
n_rot, rope_type,
|
|
8109
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8048
8110
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8049
8111
|
);
|
|
8050
8112
|
cb(Qcur, "Qcur", il);
|
|
8051
8113
|
|
|
8052
8114
|
Kcur = ggml_rope_ext(
|
|
8053
8115
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
8054
|
-
n_rot, rope_type,
|
|
8116
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8055
8117
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8056
8118
|
);
|
|
8057
8119
|
cb(Kcur, "Kcur", il);
|
|
@@ -8398,14 +8460,14 @@ struct llm_build_context {
|
|
|
8398
8460
|
|
|
8399
8461
|
Qcur = ggml_rope_ext(
|
|
8400
8462
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
8401
|
-
n_rot, rope_type,
|
|
8463
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8402
8464
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8403
8465
|
);
|
|
8404
8466
|
cb(Qcur, "Qcur", il);
|
|
8405
8467
|
|
|
8406
8468
|
Kcur = ggml_rope_ext(
|
|
8407
8469
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
8408
|
-
n_rot, rope_type,
|
|
8470
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8409
8471
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8410
8472
|
);
|
|
8411
8473
|
cb(Kcur, "Kcur", il);
|
|
@@ -8838,14 +8900,14 @@ struct llm_build_context {
|
|
|
8838
8900
|
|
|
8839
8901
|
Qcur = ggml_rope_ext(
|
|
8840
8902
|
ctx0, Qcur, inp_pos, nullptr,
|
|
8841
|
-
n_rot, rope_type,
|
|
8903
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8842
8904
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8843
8905
|
);
|
|
8844
8906
|
cb(Qcur, "Qcur", il);
|
|
8845
8907
|
|
|
8846
8908
|
Kcur = ggml_rope_ext(
|
|
8847
8909
|
ctx0, Kcur, inp_pos, nullptr,
|
|
8848
|
-
n_rot, rope_type,
|
|
8910
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8849
8911
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8850
8912
|
);
|
|
8851
8913
|
cb(Kcur, "Kcur", il);
|
|
@@ -8957,13 +9019,13 @@ struct llm_build_context {
|
|
|
8957
9019
|
|
|
8958
9020
|
// using mode = 2 for neox mode
|
|
8959
9021
|
Qcur = ggml_rope_ext(
|
|
8960
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
|
9022
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
8961
9023
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
8962
9024
|
);
|
|
8963
9025
|
cb(Qcur, "Qcur", il);
|
|
8964
9026
|
|
|
8965
9027
|
Kcur = ggml_rope_ext(
|
|
8966
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
|
9028
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
8967
9029
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
8968
9030
|
);
|
|
8969
9031
|
cb(Kcur, "Kcur", il);
|
|
@@ -9069,14 +9131,14 @@ struct llm_build_context {
|
|
|
9069
9131
|
|
|
9070
9132
|
Qcur = ggml_rope_ext(
|
|
9071
9133
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9072
|
-
n_rot, rope_type,
|
|
9134
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9073
9135
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9074
9136
|
);
|
|
9075
9137
|
cb(Qcur, "Qcur", il);
|
|
9076
9138
|
|
|
9077
9139
|
Kcur = ggml_rope_ext(
|
|
9078
9140
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9079
|
-
n_rot, rope_type,
|
|
9141
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9080
9142
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9081
9143
|
);
|
|
9082
9144
|
cb(Kcur, "Kcur", il);
|
|
@@ -9183,14 +9245,14 @@ struct llm_build_context {
|
|
|
9183
9245
|
|
|
9184
9246
|
Qcur = ggml_rope_ext(
|
|
9185
9247
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9186
|
-
n_rot, rope_type,
|
|
9248
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9187
9249
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9188
9250
|
);
|
|
9189
9251
|
cb(Qcur, "Qcur", il);
|
|
9190
9252
|
|
|
9191
9253
|
Kcur = ggml_rope_ext(
|
|
9192
9254
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9193
|
-
n_rot, rope_type,
|
|
9255
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9194
9256
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9195
9257
|
);
|
|
9196
9258
|
cb(Kcur, "Kcur", il);
|
|
@@ -9335,7 +9397,7 @@ struct llm_build_context {
|
|
|
9335
9397
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9336
9398
|
|
|
9337
9399
|
Qcur = ggml_rope_ext(
|
|
9338
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
|
9400
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
9339
9401
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9340
9402
|
);
|
|
9341
9403
|
cb(Qcur, "Qcur", il);
|
|
@@ -9346,7 +9408,7 @@ struct llm_build_context {
|
|
|
9346
9408
|
cb(Qcur, "Qcur", il);
|
|
9347
9409
|
|
|
9348
9410
|
Kcur = ggml_rope_ext(
|
|
9349
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
|
9411
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
9350
9412
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9351
9413
|
);
|
|
9352
9414
|
cb(Kcur, "Kcur", il);
|
|
@@ -9457,7 +9519,7 @@ struct llm_build_context {
|
|
|
9457
9519
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9458
9520
|
|
|
9459
9521
|
Qcur = ggml_rope_ext(
|
|
9460
|
-
ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type,
|
|
9522
|
+
ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
|
|
9461
9523
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9462
9524
|
);
|
|
9463
9525
|
cb(Qcur, "Qcur", il);
|
|
@@ -9466,7 +9528,7 @@ struct llm_build_context {
|
|
|
9466
9528
|
cb(Qcur, "Qcur", il);
|
|
9467
9529
|
|
|
9468
9530
|
Kcur = ggml_rope_ext(
|
|
9469
|
-
ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type,
|
|
9531
|
+
ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
|
|
9470
9532
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9471
9533
|
);
|
|
9472
9534
|
cb(Kcur, "Kcur", il);
|
|
@@ -9574,13 +9636,13 @@ struct llm_build_context {
|
|
|
9574
9636
|
|
|
9575
9637
|
Qcur = ggml_rope_ext(
|
|
9576
9638
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
|
|
9577
|
-
n_embd_head, rope_type,
|
|
9639
|
+
n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9578
9640
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9579
9641
|
cb(Qcur, "Qcur", il);
|
|
9580
9642
|
|
|
9581
9643
|
Kcur = ggml_rope_ext(
|
|
9582
9644
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9583
|
-
n_embd_head, rope_type,
|
|
9645
|
+
n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9584
9646
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9585
9647
|
cb(Kcur, "Kcur", il);
|
|
9586
9648
|
|
|
@@ -9782,14 +9844,14 @@ struct llm_build_context {
|
|
|
9782
9844
|
|
|
9783
9845
|
struct ggml_tensor * Qcur = ggml_rope_ext(
|
|
9784
9846
|
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9785
|
-
n_rot, rope_type,
|
|
9847
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9786
9848
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9787
9849
|
);
|
|
9788
9850
|
cb(Qcur, "Qcur", il);
|
|
9789
9851
|
|
|
9790
9852
|
struct ggml_tensor * Kcur = ggml_rope_ext(
|
|
9791
9853
|
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9792
|
-
n_rot, rope_type,
|
|
9854
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9793
9855
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9794
9856
|
);
|
|
9795
9857
|
cb(Kcur, "Kcur", il);
|
|
@@ -9898,14 +9960,14 @@ struct llm_build_context {
|
|
|
9898
9960
|
|
|
9899
9961
|
Qcur = ggml_rope_ext(
|
|
9900
9962
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9901
|
-
n_rot, rope_type,
|
|
9963
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9902
9964
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9903
9965
|
);
|
|
9904
9966
|
cb(Qcur, "Qcur", il);
|
|
9905
9967
|
|
|
9906
9968
|
Kcur = ggml_rope_ext(
|
|
9907
9969
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9908
|
-
n_rot, rope_type,
|
|
9970
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9909
9971
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9910
9972
|
);
|
|
9911
9973
|
cb(Kcur, "Kcur", il);
|
|
@@ -10015,14 +10077,14 @@ struct llm_build_context {
|
|
|
10015
10077
|
|
|
10016
10078
|
Qcur = ggml_rope_ext(
|
|
10017
10079
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10018
|
-
n_rot, rope_type,
|
|
10080
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10019
10081
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10020
10082
|
);
|
|
10021
10083
|
cb(Qcur, "Qcur", il);
|
|
10022
10084
|
|
|
10023
10085
|
Kcur = ggml_rope_ext(
|
|
10024
10086
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10025
|
-
n_rot, rope_type,
|
|
10087
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10026
10088
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10027
10089
|
);
|
|
10028
10090
|
cb(Kcur, "Kcur", il);
|
|
@@ -10145,14 +10207,14 @@ struct llm_build_context {
|
|
|
10145
10207
|
|
|
10146
10208
|
Qcur = ggml_rope_ext(
|
|
10147
10209
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10148
|
-
n_rot, rope_type,
|
|
10210
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10149
10211
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10150
10212
|
);
|
|
10151
10213
|
cb(Qcur, "Qcur", il);
|
|
10152
10214
|
|
|
10153
10215
|
Kcur = ggml_rope_ext(
|
|
10154
10216
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10155
|
-
n_rot, rope_type,
|
|
10217
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10156
10218
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10157
10219
|
);
|
|
10158
10220
|
cb(Kcur, "Kcur", il);
|
|
@@ -10217,7 +10279,7 @@ struct llm_build_context {
|
|
|
10217
10279
|
cb(cur, "lmhead_scaling", -1);
|
|
10218
10280
|
|
|
10219
10281
|
// lm_head
|
|
10220
|
-
cur = ggml_mul_mat(ctx0, model.
|
|
10282
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
10221
10283
|
cb(cur, "result_output", -1);
|
|
10222
10284
|
|
|
10223
10285
|
ggml_build_forward_expand(gf, cur);
|
|
@@ -10265,7 +10327,7 @@ struct llm_build_context {
|
|
|
10265
10327
|
|
|
10266
10328
|
Qcur = ggml_rope_ext(
|
|
10267
10329
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
|
|
10268
|
-
n_embd_head_k, rope_type,
|
|
10330
|
+
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10269
10331
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
10270
10332
|
cb(Qcur, "Qcur", il);
|
|
10271
10333
|
|
|
@@ -10274,7 +10336,7 @@ struct llm_build_context {
|
|
|
10274
10336
|
|
|
10275
10337
|
Kcur = ggml_rope_ext(
|
|
10276
10338
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10277
|
-
n_embd_head_k, rope_type,
|
|
10339
|
+
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10278
10340
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
10279
10341
|
cb(Kcur, "Kcur", il);
|
|
10280
10342
|
|
|
@@ -10385,14 +10447,14 @@ struct llm_build_context {
|
|
|
10385
10447
|
|
|
10386
10448
|
Qcur = ggml_rope_ext(
|
|
10387
10449
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10388
|
-
n_rot, rope_type,
|
|
10450
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10389
10451
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10390
10452
|
);
|
|
10391
10453
|
cb(Qcur, "Qcur", il);
|
|
10392
10454
|
|
|
10393
10455
|
Kcur = ggml_rope_ext(
|
|
10394
10456
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10395
|
-
n_rot, rope_type,
|
|
10457
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10396
10458
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10397
10459
|
);
|
|
10398
10460
|
cb(Kcur, "Kcur", il);
|
|
@@ -10675,14 +10737,14 @@ struct llm_build_context {
|
|
|
10675
10737
|
|
|
10676
10738
|
Qcur = ggml_rope_ext(
|
|
10677
10739
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10678
|
-
n_rot, rope_type,
|
|
10740
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10679
10741
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10680
10742
|
);
|
|
10681
10743
|
cb(Qcur, "Qcur", il);
|
|
10682
10744
|
|
|
10683
10745
|
Kcur = ggml_rope_ext(
|
|
10684
10746
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10685
|
-
n_rot, rope_type,
|
|
10747
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10686
10748
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10687
10749
|
);
|
|
10688
10750
|
cb(Kcur, "Kcur", il);
|
|
@@ -10806,14 +10868,14 @@ struct llm_build_context {
|
|
|
10806
10868
|
|
|
10807
10869
|
Qcur = ggml_rope_ext(
|
|
10808
10870
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10809
|
-
n_rot, rope_type,
|
|
10871
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10810
10872
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10811
10873
|
);
|
|
10812
10874
|
cb(Qcur, "Qcur", il);
|
|
10813
10875
|
|
|
10814
10876
|
Kcur = ggml_rope_ext(
|
|
10815
10877
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10816
|
-
n_rot, rope_type,
|
|
10878
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10817
10879
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10818
10880
|
);
|
|
10819
10881
|
cb(Kcur, "Kcur", il);
|
|
@@ -10920,14 +10982,14 @@ struct llm_build_context {
|
|
|
10920
10982
|
|
|
10921
10983
|
Qcur = ggml_rope_ext(
|
|
10922
10984
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10923
|
-
n_rot, rope_type,
|
|
10985
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10924
10986
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10925
10987
|
);
|
|
10926
10988
|
cb(Qcur, "Qcur", il);
|
|
10927
10989
|
|
|
10928
10990
|
Kcur = ggml_rope_ext(
|
|
10929
10991
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10930
|
-
n_rot, rope_type,
|
|
10992
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10931
10993
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10932
10994
|
);
|
|
10933
10995
|
cb(Kcur, "Kcur", il);
|
|
@@ -11055,14 +11117,14 @@ struct llm_build_context {
|
|
|
11055
11117
|
|
|
11056
11118
|
Qcur = ggml_rope_ext(
|
|
11057
11119
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
11058
|
-
n_rot, rope_type,
|
|
11120
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
11059
11121
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
11060
11122
|
);
|
|
11061
11123
|
cb(Qcur, "Qcur", il);
|
|
11062
11124
|
|
|
11063
11125
|
Kcur = ggml_rope_ext(
|
|
11064
11126
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
11065
|
-
n_rot, rope_type,
|
|
11127
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
11066
11128
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
11067
11129
|
);
|
|
11068
11130
|
cb(Kcur, "Kcur", il);
|
|
@@ -11272,7 +11334,7 @@ struct llm_build_context {
|
|
|
11272
11334
|
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
|
11273
11335
|
q_pe = ggml_rope_ext(
|
|
11274
11336
|
ctx0, q_pe, inp_pos, nullptr,
|
|
11275
|
-
n_rot, rope_type,
|
|
11337
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
11276
11338
|
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
|
11277
11339
|
);
|
|
11278
11340
|
cb(q_pe, "q_pe", il);
|
|
@@ -11281,7 +11343,7 @@ struct llm_build_context {
|
|
|
11281
11343
|
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
|
11282
11344
|
k_pe = ggml_rope_ext(
|
|
11283
11345
|
ctx0, k_pe, inp_pos, nullptr,
|
|
11284
|
-
n_rot, rope_type,
|
|
11346
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
11285
11347
|
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
|
11286
11348
|
);
|
|
11287
11349
|
cb(k_pe, "k_pe", il);
|
|
@@ -12616,27 +12678,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
|
|
|
12616
12678
|
|
|
12617
12679
|
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
|
|
12618
12680
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12619
|
-
return vocab.id_to_token[id].
|
|
12681
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
|
|
12620
12682
|
}
|
|
12621
12683
|
|
|
12622
12684
|
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
|
|
12623
12685
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12624
|
-
return vocab.id_to_token[id].
|
|
12686
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
|
|
12625
12687
|
}
|
|
12626
12688
|
|
|
12627
12689
|
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
|
|
12628
12690
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12629
|
-
return vocab.id_to_token[id].
|
|
12691
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
|
|
12630
12692
|
}
|
|
12631
12693
|
|
|
12632
12694
|
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
|
|
12633
12695
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12634
|
-
return vocab.id_to_token[id].
|
|
12696
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
|
|
12635
12697
|
}
|
|
12636
12698
|
|
|
12637
12699
|
static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
|
|
12638
12700
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12639
|
-
return vocab.id_to_token[id].
|
|
12701
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
|
|
12640
12702
|
}
|
|
12641
12703
|
|
|
12642
12704
|
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
|
@@ -13254,7 +13316,8 @@ struct fragment_buffer_variant {
|
|
|
13254
13316
|
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
|
13255
13317
|
// for each special token
|
|
13256
13318
|
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
|
|
13257
|
-
const auto &
|
|
13319
|
+
const auto & data = vocab.id_to_token[special_id];
|
|
13320
|
+
const auto & special_token = data.text;
|
|
13258
13321
|
|
|
13259
13322
|
// for each text fragment
|
|
13260
13323
|
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
|
|
@@ -13291,13 +13354,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
|
13291
13354
|
if (match > raw_text_base_offset) {
|
|
13292
13355
|
// left
|
|
13293
13356
|
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
|
13294
|
-
|
|
13295
|
-
|
|
13357
|
+
int64_t left_reminder_length = match - raw_text_base_offset;
|
|
13358
|
+
|
|
13359
|
+
if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
|
|
13360
|
+
while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
|
|
13361
|
+
left_reminder_length--;
|
|
13362
|
+
}
|
|
13363
|
+
}
|
|
13364
|
+
|
|
13365
|
+
if (left_reminder_length > 0) {
|
|
13366
|
+
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
|
13367
|
+
it++;
|
|
13368
|
+
}
|
|
13296
13369
|
|
|
13297
13370
|
#ifdef PRETOKENIZERDEBUG
|
|
13298
13371
|
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
|
13299
13372
|
#endif
|
|
13300
|
-
it++;
|
|
13301
13373
|
}
|
|
13302
13374
|
|
|
13303
13375
|
// special token
|
|
@@ -13306,16 +13378,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
|
13306
13378
|
|
|
13307
13379
|
// right
|
|
13308
13380
|
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
|
13309
|
-
|
|
13310
|
-
|
|
13311
|
-
|
|
13381
|
+
int64_t right_reminder_offset = match + special_token.length();
|
|
13382
|
+
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
|
13383
|
+
|
|
13384
|
+
if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
|
|
13385
|
+
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
|
|
13386
|
+
right_reminder_offset++;
|
|
13387
|
+
right_reminder_length--;
|
|
13388
|
+
}
|
|
13389
|
+
}
|
|
13390
|
+
|
|
13391
|
+
if (right_reminder_length > 0) {
|
|
13392
|
+
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
|
13393
|
+
it++;
|
|
13394
|
+
}
|
|
13312
13395
|
|
|
13313
13396
|
#ifdef PRETOKENIZERDEBUG
|
|
13314
13397
|
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
|
13315
13398
|
#endif
|
|
13316
13399
|
|
|
13317
|
-
it++;
|
|
13318
|
-
|
|
13319
13400
|
if (source == 0) {
|
|
13320
13401
|
buffer.erase_after(buffer.before_begin());
|
|
13321
13402
|
} else {
|
|
@@ -13361,9 +13442,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
|
13361
13442
|
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
|
13362
13443
|
// tokenizer.encode('', add_special_tokens=False) returns []
|
|
13363
13444
|
|
|
13364
|
-
static const bool rtrim = true; //TODO: as param
|
|
13365
13445
|
bool is_prev_special = false;
|
|
13366
|
-
bool special_token_rtrim = false;
|
|
13367
13446
|
|
|
13368
13447
|
if (add_special && vocab.special_add_bos != 0) {
|
|
13369
13448
|
GGML_ASSERT(vocab.special_bos_id != -1);
|
|
@@ -13373,25 +13452,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
|
13373
13452
|
|
|
13374
13453
|
for (const auto & fragment : fragment_buffer) {
|
|
13375
13454
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
|
13376
|
-
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
|
13377
|
-
|
|
13378
|
-
// TODO: It's likely possible to get rid of this string copy entirely
|
|
13379
|
-
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
|
|
13380
|
-
// and passing 'add space prefix' as bool argument
|
|
13381
|
-
//
|
|
13382
13455
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
|
13383
13456
|
|
|
13384
|
-
if (special_token_rtrim) {
|
|
13385
|
-
size_t num_whitespaces = 0;
|
|
13386
|
-
while (isspace(raw_text[num_whitespaces])) {
|
|
13387
|
-
num_whitespaces++;
|
|
13388
|
-
}
|
|
13389
|
-
if (num_whitespaces == raw_text.size()) {
|
|
13390
|
-
continue; // skip if all whitespaces
|
|
13391
|
-
}
|
|
13392
|
-
raw_text = raw_text.substr(num_whitespaces);
|
|
13393
|
-
}
|
|
13394
|
-
|
|
13395
13457
|
if (vocab.add_space_prefix) {
|
|
13396
13458
|
if (!output.size() || is_prev_special) { // prefix with space if first token
|
|
13397
13459
|
raw_text = " " + raw_text;
|
|
@@ -13407,11 +13469,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
|
13407
13469
|
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
|
13408
13470
|
output.push_back(fragment.token);
|
|
13409
13471
|
is_prev_special = true;
|
|
13410
|
-
// phi-3 special tokens without rtrim, works fine for llama-spm too
|
|
13411
|
-
special_token_rtrim = rtrim
|
|
13412
|
-
&& fragment.token != vocab.special_bos_id
|
|
13413
|
-
&& fragment.token != vocab.special_unk_id
|
|
13414
|
-
&& fragment.token != vocab.special_eos_id;
|
|
13415
13472
|
}
|
|
13416
13473
|
}
|
|
13417
13474
|
|
|
@@ -14646,260 +14703,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
|
14646
14703
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
14647
14704
|
}
|
|
14648
14705
|
|
|
14649
|
-
//
|
|
14650
|
-
// Beam search
|
|
14651
|
-
//
|
|
14652
|
-
|
|
14653
|
-
struct llama_beam {
|
|
14654
|
-
std::vector<llama_token> tokens;
|
|
14655
|
-
float p; // Cumulative beam probability (renormalized relative to all beams)
|
|
14656
|
-
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
|
|
14657
|
-
// Sort beams by probability. In case of ties, prefer beams at eob.
|
|
14658
|
-
bool operator<(const llama_beam & rhs) const {
|
|
14659
|
-
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
|
|
14660
|
-
}
|
|
14661
|
-
// Shift off first n tokens and discard them.
|
|
14662
|
-
void shift_tokens(const size_t n) {
|
|
14663
|
-
if (n) {
|
|
14664
|
-
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
|
|
14665
|
-
tokens.resize(tokens.size() - n);
|
|
14666
|
-
}
|
|
14667
|
-
}
|
|
14668
|
-
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
|
|
14669
|
-
};
|
|
14670
|
-
|
|
14671
|
-
// A struct for calculating logit-related info.
|
|
14672
|
-
struct llama_logit_info {
|
|
14673
|
-
const float * const logits;
|
|
14674
|
-
const int n_vocab;
|
|
14675
|
-
const float max_l;
|
|
14676
|
-
const float normalizer;
|
|
14677
|
-
struct sum_exp {
|
|
14678
|
-
float max_l;
|
|
14679
|
-
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
|
|
14680
|
-
};
|
|
14681
|
-
llama_logit_info(llama_context * ctx)
|
|
14682
|
-
: logits(llama_get_logits(ctx))
|
|
14683
|
-
, n_vocab(llama_n_vocab(llama_get_model(ctx)))
|
|
14684
|
-
, max_l(*std::max_element(logits, logits + n_vocab))
|
|
14685
|
-
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
|
|
14686
|
-
{ }
|
|
14687
|
-
llama_token_data get_token_data(const llama_token token_id) const {
|
|
14688
|
-
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
|
|
14689
|
-
return {token_id, logits[token_id], p};
|
|
14690
|
-
}
|
|
14691
|
-
// Return top k token_data by logit.
|
|
14692
|
-
std::vector<llama_token_data> top_k(size_t k) {
|
|
14693
|
-
std::vector<llama_token_data> min_heap; // min-heap by logit
|
|
14694
|
-
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
|
|
14695
|
-
min_heap.reserve(k_min);
|
|
14696
|
-
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
|
|
14697
|
-
min_heap.push_back(get_token_data(token_id));
|
|
14698
|
-
}
|
|
14699
|
-
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
|
|
14700
|
-
std::make_heap(min_heap.begin(), min_heap.end(), comp);
|
|
14701
|
-
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
|
|
14702
|
-
if (min_heap.front().logit < logits[token_id]) {
|
|
14703
|
-
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
|
|
14704
|
-
min_heap.back().id = token_id;
|
|
14705
|
-
min_heap.back().logit = logits[token_id];
|
|
14706
|
-
std::push_heap(min_heap.begin(), min_heap.end(), comp);
|
|
14707
|
-
}
|
|
14708
|
-
}
|
|
14709
|
-
return min_heap;
|
|
14710
|
-
}
|
|
14711
|
-
float probability_from_logit(float logit) const {
|
|
14712
|
-
return normalizer * std::exp(logit - max_l);
|
|
14713
|
-
}
|
|
14714
|
-
};
|
|
14715
|
-
|
|
14716
|
-
struct llama_beam_search_data {
|
|
14717
|
-
llama_context * ctx;
|
|
14718
|
-
size_t n_beams;
|
|
14719
|
-
int n_past;
|
|
14720
|
-
int n_predict;
|
|
14721
|
-
std::vector<llama_beam> beams;
|
|
14722
|
-
std::vector<llama_beam> next_beams;
|
|
14723
|
-
|
|
14724
|
-
// Re-calculated on each loop iteration
|
|
14725
|
-
size_t common_prefix_length;
|
|
14726
|
-
|
|
14727
|
-
// Used to communicate to/from callback on beams state.
|
|
14728
|
-
std::vector<llama_beam_view> beam_views;
|
|
14729
|
-
|
|
14730
|
-
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
|
|
14731
|
-
: ctx(ctx)
|
|
14732
|
-
, n_beams(n_beams)
|
|
14733
|
-
, n_past(n_past)
|
|
14734
|
-
, n_predict(n_predict)
|
|
14735
|
-
, beam_views(n_beams) {
|
|
14736
|
-
beams.reserve(n_beams);
|
|
14737
|
-
next_beams.reserve(n_beams);
|
|
14738
|
-
}
|
|
14739
|
-
|
|
14740
|
-
// Collapse beams to a single beam given by index.
|
|
14741
|
-
void collapse_beams(const size_t beam_idx) {
|
|
14742
|
-
if (0u < beam_idx) {
|
|
14743
|
-
std::swap(beams[0], beams[beam_idx]);
|
|
14744
|
-
}
|
|
14745
|
-
beams.resize(1);
|
|
14746
|
-
}
|
|
14747
|
-
|
|
14748
|
-
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
|
14749
|
-
// The repetitive patterns below reflect the 2 stages of heaps:
|
|
14750
|
-
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
|
14751
|
-
// * If the heap is full and a new element is found that should be included, pop the
|
|
14752
|
-
// least element to the back(), replace it with the new, then push it into the heap.
|
|
14753
|
-
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
|
|
14754
|
-
// Min-heaps use a greater-than comparator.
|
|
14755
|
-
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
|
|
14756
|
-
if (beam.eob) {
|
|
14757
|
-
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
|
|
14758
|
-
if (next_beams.size() < n_beams) {
|
|
14759
|
-
next_beams.push_back(std::move(beam));
|
|
14760
|
-
if (next_beams.size() == n_beams) {
|
|
14761
|
-
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14762
|
-
}
|
|
14763
|
-
} else if (next_beams.front().p < beam.p) {
|
|
14764
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14765
|
-
next_beams.back() = std::move(beam);
|
|
14766
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14767
|
-
}
|
|
14768
|
-
} else {
|
|
14769
|
-
// beam is not at end-of-sentence, so branch with next top_k tokens.
|
|
14770
|
-
if (!beam.tokens.empty()) {
|
|
14771
|
-
llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
|
|
14772
|
-
}
|
|
14773
|
-
llama_logit_info logit_info(ctx);
|
|
14774
|
-
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
|
14775
|
-
|
|
14776
|
-
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
|
|
14777
|
-
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
|
|
14778
|
-
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
|
14779
|
-
|
|
14780
|
-
size_t i=0;
|
|
14781
|
-
if (next_beams.size() < n_beams) {
|
|
14782
|
-
for (; next_beams.size() < n_beams ; ++i) {
|
|
14783
|
-
llama_beam next_beam = beam;
|
|
14784
|
-
next_beam.tokens.push_back(next_tokens[i].id);
|
|
14785
|
-
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
|
14786
|
-
next_beams.push_back(std::move(next_beam));
|
|
14787
|
-
}
|
|
14788
|
-
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14789
|
-
} else {
|
|
14790
|
-
for (; next_beams.front().p == 0.0f ; ++i) {
|
|
14791
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14792
|
-
next_beams.back() = beam;
|
|
14793
|
-
next_beams.back().tokens.push_back(next_tokens[i].id);
|
|
14794
|
-
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
|
14795
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14796
|
-
}
|
|
14797
|
-
}
|
|
14798
|
-
for (; i < n_beams ; ++i) {
|
|
14799
|
-
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
|
|
14800
|
-
if (next_beams.front().p < next_p) {
|
|
14801
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14802
|
-
next_beams.back() = beam;
|
|
14803
|
-
next_beams.back().tokens.push_back(next_tokens[i].id);
|
|
14804
|
-
next_beams.back().p = next_p;
|
|
14805
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14806
|
-
}
|
|
14807
|
-
}
|
|
14808
|
-
}
|
|
14809
|
-
}
|
|
14810
|
-
|
|
14811
|
-
// Find common_prefix_length based on beams.
|
|
14812
|
-
// Requires beams is not empty.
|
|
14813
|
-
size_t find_common_prefix_length() {
|
|
14814
|
-
size_t common_prefix_length = beams[0].tokens.size();
|
|
14815
|
-
for (size_t i = 1 ; i < beams.size() ; ++i) {
|
|
14816
|
-
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
|
|
14817
|
-
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
|
|
14818
|
-
if (beams[0].tokens[j] != beams[i].tokens[j]) {
|
|
14819
|
-
common_prefix_length = j;
|
|
14820
|
-
break;
|
|
14821
|
-
}
|
|
14822
|
-
}
|
|
14823
|
-
}
|
|
14824
|
-
return common_prefix_length;
|
|
14825
|
-
}
|
|
14826
|
-
|
|
14827
|
-
// Construct beams_state to send back to caller via the callback function.
|
|
14828
|
-
// Side effect: set common_prefix_length = find_common_prefix_length();
|
|
14829
|
-
llama_beams_state get_beams_state(const bool last_call) {
|
|
14830
|
-
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
|
14831
|
-
beam_views[i] = beams[i].view();
|
|
14832
|
-
}
|
|
14833
|
-
common_prefix_length = find_common_prefix_length();
|
|
14834
|
-
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
|
|
14835
|
-
}
|
|
14836
|
-
|
|
14837
|
-
// Loop:
|
|
14838
|
-
// * while i < n_predict, AND
|
|
14839
|
-
// * any of the beams have not yet reached end-of-beam (eob), AND
|
|
14840
|
-
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
|
|
14841
|
-
// (since all other beam probabilities can only decrease)
|
|
14842
|
-
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
|
|
14843
|
-
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
|
|
14844
|
-
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
|
|
14845
|
-
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
|
|
14846
|
-
!beams[top_beam_index()].eob ; ++i) {
|
|
14847
|
-
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
|
|
14848
|
-
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
|
|
14849
|
-
if (common_prefix_length) {
|
|
14850
|
-
llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
|
|
14851
|
-
n_past += common_prefix_length;
|
|
14852
|
-
}
|
|
14853
|
-
// Zero-out next_beam probabilities to place them last in following min-heap.
|
|
14854
|
-
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
|
|
14855
|
-
for (llama_beam & beam : beams) {
|
|
14856
|
-
beam.shift_tokens(common_prefix_length);
|
|
14857
|
-
fill_next_beams_by_top_probabilities(beam);
|
|
14858
|
-
}
|
|
14859
|
-
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
|
|
14860
|
-
beams.swap(next_beams);
|
|
14861
|
-
renormalize_beam_probabilities(beams);
|
|
14862
|
-
}
|
|
14863
|
-
collapse_beams(top_beam_index());
|
|
14864
|
-
callback(callback_data, get_beams_state(true));
|
|
14865
|
-
}
|
|
14866
|
-
|
|
14867
|
-
// As beams grow, the cumulative probabilities decrease.
|
|
14868
|
-
// Renormalize them to avoid floating point underflow.
|
|
14869
|
-
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
|
|
14870
|
-
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
|
|
14871
|
-
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
|
|
14872
|
-
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
|
|
14873
|
-
}
|
|
14874
|
-
|
|
14875
|
-
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
|
|
14876
|
-
size_t top_beam_index() {
|
|
14877
|
-
return std::max_element(beams.begin(), beams.end()) - beams.begin();
|
|
14878
|
-
}
|
|
14879
|
-
|
|
14880
|
-
// Copy (p,eob) for each beam which may have been changed by the callback.
|
|
14881
|
-
void update_beams_from_beam_views() {
|
|
14882
|
-
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
|
14883
|
-
beams[i].p = beam_views[i].p;
|
|
14884
|
-
beams[i].eob = beam_views[i].eob;
|
|
14885
|
-
}
|
|
14886
|
-
}
|
|
14887
|
-
};
|
|
14888
|
-
|
|
14889
|
-
void llama_beam_search(llama_context * ctx,
|
|
14890
|
-
llama_beam_search_callback_fn_t callback, void * callback_data,
|
|
14891
|
-
size_t n_beams, int n_past, int n_predict) {
|
|
14892
|
-
assert(ctx);
|
|
14893
|
-
const int64_t t_start_sample_us = ggml_time_us();
|
|
14894
|
-
|
|
14895
|
-
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
|
|
14896
|
-
|
|
14897
|
-
beam_search_data.loop(callback, callback_data);
|
|
14898
|
-
|
|
14899
|
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
14900
|
-
ctx->n_sample++;
|
|
14901
|
-
}
|
|
14902
|
-
|
|
14903
14706
|
//
|
|
14904
14707
|
// quantization
|
|
14905
14708
|
//
|
|
@@ -16110,7 +15913,7 @@ bool llama_supports_mlock(void) {
|
|
|
16110
15913
|
}
|
|
16111
15914
|
|
|
16112
15915
|
bool llama_supports_gpu_offload(void) {
|
|
16113
|
-
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
|
15916
|
+
#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
|
16114
15917
|
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
|
|
16115
15918
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
|
16116
15919
|
return true;
|
|
@@ -16167,7 +15970,7 @@ struct llama_model * llama_load_model_from_file(
|
|
|
16167
15970
|
return true;
|
|
16168
15971
|
};
|
|
16169
15972
|
}
|
|
16170
|
-
if (params.rpc_servers != nullptr) {
|
|
15973
|
+
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
|
|
16171
15974
|
// split the servers set them into model->rpc_servers
|
|
16172
15975
|
std::string servers(params.rpc_servers);
|
|
16173
15976
|
size_t pos = 0;
|
|
@@ -16221,6 +16024,11 @@ struct llama_context * llama_new_context_with_model(
|
|
|
16221
16024
|
params.flash_attn = false;
|
|
16222
16025
|
}
|
|
16223
16026
|
|
|
16027
|
+
if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
|
|
16028
|
+
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
|
|
16029
|
+
return nullptr;
|
|
16030
|
+
}
|
|
16031
|
+
|
|
16224
16032
|
llama_context * ctx = new llama_context(*model);
|
|
16225
16033
|
|
|
16226
16034
|
const auto & hparams = model->hparams;
|
|
@@ -16259,8 +16067,8 @@ struct llama_context * llama_new_context_with_model(
|
|
|
16259
16067
|
|
|
16260
16068
|
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
|
16261
16069
|
|
|
16262
|
-
cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
|
|
16263
|
-
hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
|
|
16070
|
+
cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
|
|
16071
|
+
hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
|
|
16264
16072
|
hparams.n_ctx_train;
|
|
16265
16073
|
|
|
16266
16074
|
cparams.cb_eval = params.cb_eval;
|
|
@@ -16325,17 +16133,7 @@ struct llama_context * llama_new_context_with_model(
|
|
|
16325
16133
|
|
|
16326
16134
|
if (!hparams.vocab_only) {
|
|
16327
16135
|
// initialize backends
|
|
16328
|
-
#if defined(GGML_USE_RPC)
|
|
16329
|
-
for (auto & server : model->rpc_servers) {
|
|
16330
|
-
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
|
16331
|
-
if (backend == nullptr) {
|
|
16332
|
-
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
|
16333
|
-
llama_free(ctx);
|
|
16334
|
-
return nullptr;
|
|
16335
|
-
}
|
|
16336
|
-
ctx->backends.push_back(backend);
|
|
16337
|
-
}
|
|
16338
|
-
#elif defined(GGML_USE_METAL)
|
|
16136
|
+
#if defined(GGML_USE_METAL)
|
|
16339
16137
|
if (model->n_gpu_layers > 0) {
|
|
16340
16138
|
ctx->backend_metal = ggml_backend_metal_init();
|
|
16341
16139
|
if (ctx->backend_metal == nullptr) {
|
|
@@ -16374,7 +16172,7 @@ struct llama_context * llama_new_context_with_model(
|
|
|
16374
16172
|
return nullptr;
|
|
16375
16173
|
}
|
|
16376
16174
|
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
|
16377
|
-
ggml_backend_t backend = ggml_backend_vk_init(0);
|
|
16175
|
+
ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
|
|
16378
16176
|
if (backend == nullptr) {
|
|
16379
16177
|
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
|
16380
16178
|
llama_free(ctx);
|
|
@@ -16427,6 +16225,19 @@ struct llama_context * llama_new_context_with_model(
|
|
|
16427
16225
|
}
|
|
16428
16226
|
ctx->backends.push_back(backend);
|
|
16429
16227
|
}
|
|
16228
|
+
#endif
|
|
16229
|
+
#if defined(GGML_USE_RPC)
|
|
16230
|
+
if (model->n_gpu_layers > 0) {
|
|
16231
|
+
for (const auto & endpoint : model->rpc_servers) {
|
|
16232
|
+
ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
|
|
16233
|
+
if (backend == nullptr) {
|
|
16234
|
+
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
|
|
16235
|
+
llama_free(ctx);
|
|
16236
|
+
return nullptr;
|
|
16237
|
+
}
|
|
16238
|
+
ctx->backends.push_back(backend);
|
|
16239
|
+
}
|
|
16240
|
+
}
|
|
16430
16241
|
#endif
|
|
16431
16242
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
|
16432
16243
|
if (ctx->backend_cpu == nullptr) {
|
|
@@ -18209,9 +18020,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
|
|
|
18209
18020
|
return model->vocab.id_to_token[token].score;
|
|
18210
18021
|
}
|
|
18211
18022
|
|
|
18212
|
-
|
|
18023
|
+
llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
|
|
18213
18024
|
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
18214
|
-
return model->vocab.id_to_token[token].type;
|
|
18025
|
+
return model->vocab.id_to_token[token].attr;
|
|
18215
18026
|
}
|
|
18216
18027
|
|
|
18217
18028
|
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
|
@@ -18313,9 +18124,14 @@ static std::string llama_decode_text(const std::string & text) {
|
|
|
18313
18124
|
|
|
18314
18125
|
// does not write null-terminator to buf
|
|
18315
18126
|
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
|
|
18127
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
|
|
18128
|
+
if (!special && llama_is_control_token(model->vocab, token)) {
|
|
18129
|
+
return 0;
|
|
18130
|
+
}
|
|
18131
|
+
|
|
18316
18132
|
// if we have a cache - use it
|
|
18317
18133
|
{
|
|
18318
|
-
const auto & cache =
|
|
18134
|
+
const auto & cache = model->vocab.cache_token_to_piece;
|
|
18319
18135
|
|
|
18320
18136
|
if (!cache.empty()) {
|
|
18321
18137
|
const auto & res = cache.at(token);
|