llama_cpp 0.16.0 → 0.16.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +3 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +119 -54
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
- data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
- data/vendor/tmp/llama.cpp/ggml.c +158 -414
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +628 -279
- data/vendor/tmp/llama.cpp/llama.h +9 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +15 -3
@@ -21,6 +21,10 @@
 # include "ggml-kompute.h"
 #endif
 
+#ifdef GGML_USE_BLAS
+# include "ggml-blas.h"
+#endif
+
 #ifdef GGML_USE_METAL
 # include "ggml-metal.h"
 #endif
@@ -282,6 +286,7 @@ enum llm_kv {
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
@@ -360,21 +365,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
     { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
 
-    { LLM_KV_VOCAB_SIZE,
-    { LLM_KV_CONTEXT_LENGTH,
-    { LLM_KV_EMBEDDING_LENGTH,
-    { LLM_KV_BLOCK_COUNT,
-    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,
-    { LLM_KV_FEED_FORWARD_LENGTH,
-    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
+    { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+    { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+    { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+    { LLM_KV_BLOCK_COUNT, "%s.block_count" },
+    { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+    { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+    { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+    { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+    { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+    { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+    { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+    { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+    { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+    { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -704,6 +710,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
         { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+        { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
         { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
         { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
         { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -1273,6 +1280,126 @@ struct no_init {
 };
 
 struct llama_file {
+
+#if defined(_WIN32)
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    HANDLE fp_win32;
+    size_t size;
+
+private:
+    std::string GetErrorMessageWin32(DWORD error_code) const {
+        std::string ret;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                      NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (!bufLen) {
+            ret = format("Win32 error code: %s", error_code);
+        } else {
+            ret = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+        }
+
+        return ret;
+    }
+
+public:
+
+    llama_file(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+        // SetFilePointerEx returns the current position when seeking relative 0 bytes
+        LARGE_INTEGER li;
+        li.QuadPart = 0;
+        BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+
+        return li.QuadPart;
+    }
+
+    void seek(size_t offset, int whence) const {
+        // no need to convert SEEK_* to FILE_*. The enums are the same.
+        // Still, keep static asserts to avoid failures in the future.
+        static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+        static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+        static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+        LARGE_INTEGER li;
+        li.QuadPart = offset;
+        BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
+        // use the Win32 API to do file io instead of the C/C++ library functions.
+
+        // There are conditions under which ReadFile cannot read chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+            DWORD chunk_read = 0;
+            BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+            if (!result) {
+                throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_read < chunk_size || chunk_read == 0) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+
+            bytes_read += chunk_read;
+        } ;
+    }
+
+    uint32_t read_u32() const {
+        uint32_t val;
+        read_raw(&val, sizeof(val));
+        return val;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        // There are conditions under which WriteFile cannot write chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_written = 0;
+        while (bytes_written < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+            DWORD chunk_written = 0;
+            BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+            if (!result) {
+                throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_written < chunk_size || chunk_written == 0) {
+                throw std::runtime_error("unexpectedly failed to write bytes");
+            }
+
+            bytes_written += chunk_written;
+        }
+    }
+
+    void write_u32(std::uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#else
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
@@ -1293,7 +1420,10 @@ struct llama_file {
 #else
         long ret = std::ftell(fp);
 #endif
-
+        if (ret == -1) {
+            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+        }
+
         return (size_t) ret;
     }
 
@@ -1303,7 +1433,9 @@ struct llama_file {
 #else
         int ret = std::fseek(fp, (long) offset, whence);
 #endif
-
+        if (ret != 0) {
+            throw std::runtime_error(format("seek error: %s", strerror(errno)));
+        }
     }
 
     void read_raw(void * ptr, size_t len) const {
@@ -1346,6 +1478,7 @@ struct llama_file {
             std::fclose(fp);
         }
     }
+#endif
 };
 
 using llama_files = std::vector<std::unique_ptr<llama_file>>;
 
@@ -1839,6 +1972,7 @@ struct llama_hparams {
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
+    uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float expert_weights_scale = 0.0;
 
@@ -1887,6 +2021,7 @@ struct llama_hparams {
         if (this->n_lora_q != other.n_lora_q) return true;
         if (this->n_lora_kv != other.n_lora_kv) return true;
         if (this->n_ff_exp != other.n_ff_exp) return true;
+        if (this->n_ff_shexp != other.n_ff_shexp) return true;
         if (this->n_expert_shared != other.n_expert_shared) return true;
 
         if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -2158,6 +2293,8 @@ struct llama_vocab {
     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
+    int max_token_len = 0; // used for optimizing longest token search
+
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data> id_to_token;
 
@@ -2175,16 +2312,17 @@ struct llama_vocab {
     id special_cls_id = -1;
     id special_mask_id = -1;
 
-    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
-    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
-
     id linefeed_id = 13;
     id special_prefix_id = -1;
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
 
-
+    // tokenizer flags
+    bool tokenizer_add_space_prefix = true;
+    bool tokenizer_add_bos = false;
+    bool tokenizer_add_eos = false;
+    bool tokenizer_ignore_merges = false;
 
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
         GGML_ASSERT(token_left.find(' ') == std::string::npos);
@@ -2298,9 +2436,13 @@ struct llama_context {
     std::vector<ggml_backend_t> backends;
 #ifdef GGML_USE_METAL
     ggml_backend_t backend_metal = nullptr;
+#endif
+#ifdef GGML_USE_BLAS
+    ggml_backend_t backend_blas = nullptr;
 #endif
     ggml_backend_t backend_cpu = nullptr;
 
+
     const llama_model & model;
 
     // key + value cache for the self attention
@@ -3712,6 +3854,44 @@ struct llama_model_loader {
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
 
+#if defined(GGML_USE_CUDA)
+        // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
+        // NVMe raid configurations might require more / larger buffers.
+        constexpr size_t num_buffers = 4;
+        constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+        std::vector<ggml_backend_buffer_t> host_buffers;
+        std::vector<void*> host_ptrs;
+        std::vector<ggml_backend_event_t> events;
+        size_t buffer_idx = 0; // buffer to use for async loads
+
+        ggml_backend_t cuda_backend = nullptr;
+        if (!use_mmap && !check_tensors) {
+            // When not using mmaped io use async uploads from pinned memory to GPU memory.
+            // First determine if the CUDA backend is active, and if so, determine the device ID.
+            ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
+            if (buf) {
+                ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
+                for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+                    auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
+                    if (buffer_type == cuda_buffer_type) {
+                        cuda_backend = ggml_backend_cuda_init(i);
+                        break;
+                    }
+                }
+            }
+
+            // If the cuda backend is active create pinned memory buffers and events for synchronisation.
+            if (cuda_backend) {
+                for (size_t idx = 0; idx < num_buffers; ++idx) {
+                    host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
+                    host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+                    events.emplace_back(ggml_backend_event_new(cuda_backend));
+                }
+            }
+        }
+#endif
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3767,12 +3947,36 @@ struct llama_model_loader {
                 }));
             }
         } else {
-
-
-
-
-
-
+#if defined(GGML_USE_CUDA)
+            // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+            if (cuda_backend) {
+                file->seek(weight->offs, SEEK_SET);
+
+                size_t bytes_read = 0;
+
+                while (bytes_read < n_size) {
+                    size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+                    ggml_backend_event_synchronize(events[buffer_idx]);
+                    file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                    ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                    ggml_backend_event_record(events[buffer_idx]);
+
+                    bytes_read += read_iteration;
+                    ++buffer_idx;
+                    buffer_idx %= num_buffers;
+                }
+            }
+            else
+#endif
+            {
+                read_buf.resize(n_size);
+                file->seek(weight->offs, SEEK_SET);
+                file->read_raw(read_buf.data(), n_size);
+                ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                    throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                }
             }
         }
     }
@@ -3780,6 +3984,18 @@ struct llama_model_loader {
             size_done += n_size;
         }
 
+#if defined(GGML_USE_CUDA)
+        // free temporary resources used for async cuda uploads
+        if (cuda_backend) {
+            for (size_t idx = 0; idx < num_buffers;++idx) {
+                ggml_backend_event_synchronize(events[idx]);
+                ggml_backend_event_free(events[idx]);
+                ggml_backend_buffer_free(host_buffers[idx]);
+            }
+            ggml_backend_free(cuda_backend);
+        }
+#endif
+
         // check validation results
         bool validation_failed = false;
         for (auto & future : validation_result) {
@@ -4246,6 +4462,9 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_QWEN2MOE:
             {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_A2_7B; break;
@@ -4552,38 +4771,9 @@ static void llm_load_vocab(
         vocab.special_cls_id = -1;
         vocab.special_mask_id = -1;
 
-        // For Fill-In-the-Middle (FIM)/infill models which where converted
-        // prior to support of FIM special tokens in GGUF, the following
-        // will allow those models to continue to work. The general names
-        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
-        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
-        // new versions of these models have been published.
-        std::string gen_name;
-        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
-        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
-            [](unsigned char c){ return std::tolower(c); });
-
-        if (gen_name.find("code") != std::string::npos) {
-            if (model.arch == LLM_ARCH_LLAMA) {
-                vocab.special_prefix_id = 32007;
-                vocab.special_suffix_id = 32008;
-                vocab.special_middle_id = 32009;
-                vocab.special_eot_id = 32010;
-            } else if (model.arch == LLM_ARCH_GEMMA) {
-                vocab.special_prefix_id = 67;
-                vocab.special_suffix_id = 69;
-                vocab.special_middle_id = 68;
-                // TODO: this is not EOT, it is "file separator" token, needs fix
-                // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
-                //vocab.special_eot_id = 70;
-                vocab.special_eot_id = 107;
-            }
-        }
-
         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {
-            vocab.
+            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
         } // The default value of add_space_prefix is true.
     } else if (tokenizer_model == "bert") {
         vocab.type = LLAMA_VOCAB_TYPE_WPM;
@@ -4596,13 +4786,13 @@ static void llm_load_vocab(
         vocab.special_pad_id = 0;
         vocab.special_cls_id = 101;
         vocab.special_mask_id = 103;
-        vocab.
+        vocab.tokenizer_add_space_prefix = false;
     } else if (tokenizer_model == "gpt2") {
         vocab.type = LLAMA_VOCAB_TYPE_BPE;
 
         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {
-            vocab.
+            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
         }
 
         // read bpe merges and populate bpe ranks
@@ -4653,14 +4843,15 @@ static void llm_load_vocab(
             LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
             LLAMA_LOG_WARN("%s: \n", __func__);
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (
-                tokenizer_pre == "default") {
+        } else if (tokenizer_pre == "default") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         } else if (
                 tokenizer_pre == "llama3" ||
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+            vocab.tokenizer_ignore_merges = true;
+            vocab.tokenizer_add_bos = true;
         } else if (
                 tokenizer_pre == "deepseek-llm") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
@@ -4681,7 +4872,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de"
+                tokenizer_pre == "jina-v2-de" ||
+                tokenizer_pre == "jina-v2-code") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -4704,9 +4896,20 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "smaug-bpe") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+        } else if (
+                tokenizer_pre == "poro-chat") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
+    } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        vocab.tokenizer_add_bos = true;
+        vocab.tokenizer_add_eos = false;
+    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
+        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        vocab.tokenizer_add_bos = true;
+        vocab.tokenizer_add_eos = false;
     } else {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
     }
@@ -4738,6 +4941,7 @@ static void llm_load_vocab(
         GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
 
         vocab.token_to_id[word] = i;
+        vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
 
         auto & token_data = vocab.id_to_token[i];
         token_data.text = std::move(word);
@@ -4761,6 +4965,45 @@ static void llm_load_vocab(
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        // For Fill-In-the-Middle (FIM)/infill models which where converted
+        // prior to support of FIM special tokens in GGUF, the following
+        // will allow those models to continue to work. The general names
+        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+        // new versions of these models have been published.
+        std::string gen_name;
+        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+            [](unsigned char c){ return std::tolower(c); });
+
+        if (gen_name.find("code") != std::string::npos) {
+            if (model.arch == LLM_ARCH_LLAMA
+              && 32010 < vocab.id_to_token.size()
+              && vocab.id_to_token[32007].text == "<PRE>"
+              && vocab.id_to_token[32008].text == "<SUF>"
+              && vocab.id_to_token[32009].text == "<MID>"
+              && vocab.id_to_token[32010].text == "<EOT>") {
+                vocab.special_prefix_id = 32007;
+                vocab.special_suffix_id = 32008;
+                vocab.special_middle_id = 32009;
+                vocab.special_eot_id = 32010;
+            } else if (model.arch == LLM_ARCH_GEMMA
+              && 107 < vocab.id_to_token.size()
+              && vocab.id_to_token[67].text == "<|fim_prefix|>"
+              && vocab.id_to_token[69].text == "<|fim_suffix|>"
+              && vocab.id_to_token[68].text == "<|fim_middle|>"
+              && vocab.id_to_token[107].text == "<end_of_turn>") {
+                vocab.special_prefix_id = 67;
+                vocab.special_suffix_id = 69;
+                vocab.special_middle_id = 68;
+                // TODO: this is not EOT, it is "file separator" token, needs fix
+                // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+                //vocab.special_eot_id = 70;
+                vocab.special_eot_id = 107;
+            }
+        }
+
         try {
             vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
         } catch (const std::exception & e) {
@@ -4812,10 +5055,10 @@ static void llm_load_vocab(
         bool temp = true;
 
         if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
-            vocab.
+            vocab.tokenizer_add_bos = temp;
         }
         if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
-            vocab.
+            vocab.tokenizer_add_eos = temp;
         }
     }
 
@@ -4915,7 +5158,7 @@ static void llm_load_vocab(
     );
 
     // set attributes by model/tokenizer name
-    if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-
+    if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
         _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
     } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
         for (auto id : vocab.cache_special_tokens) {
@@ -5009,6 +5252,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
     if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
 
+    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
+
     if (model.arch == LLM_ARCH_DEEPSEEK2) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -5018,6 +5263,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
+
+    if (model.arch == LLM_ARCH_QWEN2MOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+    }
 }
 
 // Returns false if cancelled by progress_callback
@@ -5161,7 +5411,7 @@ static bool llm_load_tensors(
     // create tensors for the weights
     {
         const int64_t n_embd = hparams.n_embd;
-        const int64_t n_embd_head = n_embd / hparams.n_head;
+        const int64_t n_embd_head = (hparams.n_head == 0) ? 0 : n_embd / hparams.n_head;
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
         const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -5515,7 +5765,7 @@ static bool llm_load_tensors(
 
                         layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
                     } else {
-                        layer.ffn_gate
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                     }
 
                     layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5556,6 +5806,9 @@ static bool llm_load_tensors(
                     layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
                     layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
 
+                    layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 
@@ -5801,16 +6054,17 @@ static bool llm_load_tensors(
                     GGML_ASSERT(hparams.n_expert_used > 0);
 
                     // MoE branch
-                    auto n_ff_exp = n_ff / hparams.n_expert_used;
+                    auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
                     layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
                     layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
                     layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
 
                     // Shared expert branch
+                    auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
                     layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
-                    layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd,
-                    layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {
-                    layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd,
+                    layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
+                    layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
+                    layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp});
                 }
             } break;
         case LLM_ARCH_PHI2:
@@ -6600,16 +6854,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
     }
 #endif
 
-#ifdef GGML_USE_SYCL
-    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
-        ggml_backend_sycl_set_single_device_mode(params.main_gpu);
-        //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
-        params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
-    } else {
-        ggml_backend_sycl_set_mul_device_mode();
-    }
-#endif
-
     if (!llm_load_tensors(
         ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
         params.progress_callback, params.progress_callback_user_data
@@ -7410,6 +7654,50 @@ struct llm_build_context {
         return lctx.inp_s_seq;
     }
 
+    struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
+        // find result_norm tensor for input
+        struct ggml_tensor * inp = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            inp = gf->nodes[i];
+            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+                break;
+            } else {
+                inp = nullptr;
+            }
+        }
+        GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+        struct ggml_tensor * cur;
+
+        switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    struct ggml_tensor * inp_mean = build_inp_mean();
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+                } break;
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
+                {
+                    struct ggml_tensor * inp_cls = build_inp_cls();
+                    cur = ggml_get_rows(ctx0, inp, inp_cls);
+                } break;
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    cur = inp;
+                } break;
+            default:
+                {
+                    GGML_ASSERT(false && "unknown pooling type");
+                } break;
+        }
+
+        cb(cur, "result_embd_pooled", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
@@ -8390,8 +8678,6 @@ struct llm_build_context {
         if (model.arch != LLM_ARCH_JINA_BERT_V2) {
             inp_pos = build_inp_pos();
         }
-        struct ggml_tensor * inp_mean = build_inp_mean();
-        struct ggml_tensor * inp_cls = build_inp_cls();
 
         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8519,6 +8805,11 @@ struct llm_build_context {
             // attention layer norm
            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
 
+            if (model.layers[il].attn_norm_2 != nullptr) {
+                cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+                cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+            }
+
             struct ggml_tensor * ffn_inp = cur;
             cb(ffn_inp, "ffn_inp", il);
 
@@ -8561,28 +8852,6 @@ struct llm_build_context {
         cur = inpL;
         cb(cur, "result_embd", -1);
 
-        // pooling layer
-        switch (pooling_type) {
-            case LLAMA_POOLING_TYPE_NONE:
-                {
-                    // nop
-                } break;
-            case LLAMA_POOLING_TYPE_MEAN:
-                {
-                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_CLS:
-                {
-                    cur = ggml_get_rows(ctx0, cur, inp_cls);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_UNSPECIFIED:
-                {
-                    GGML_ASSERT(false && "Invalid pooling type");
-                } break;
-        }
-
         ggml_build_forward_expand(gf, cur);
 
         return gf;
@@ -11520,7 +11789,8 @@ static struct ggml_cgraph * llama_build_graph(
         if (batch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 for (auto * backend : lctx.backends) {
-                    if (
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
                         ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
                         break;
                     }
@@ -11666,6 +11936,11 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
+    // add on pooling layer
+    if (lctx.cparams.embeddings) {
+        result = llm.append_pooling(result);
+    }
+
     llm.free();
 
     return result;
@@ -11755,7 +12030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         // (!a || b) is a logical implication (a -> b)
         // !hparams.causal_attn -> !cparams.causal_attn
         (hparams.causal_attn || !cparams.causal_attn) &&
-        "causal attention
+        "causal attention is not supported by this model"
     );
 
     if (lctx.inp_KQ_mask) {
@@ -11887,6 +12162,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(lctx.inp_cls);
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
+        std::vector<int> last_pos(n_tokens, -1);
+        std::vector<int> last_row(n_tokens, -1);
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            const llama_pos pos = batch.pos[i];
+
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+
+            if (pos >= last_pos[seq_id]) {
+                last_pos[seq_id] = pos;
+                last_row[seq_id] = i;
+            }
+        }
+
+        for (int i = 0; i < n_tokens; ++i) {
+            if (last_row[i] >= 0) {
+                data[i] = last_row[i];
+            }
+        }
+    }
+
     if (kv_self.recurrent) {
         const int64_t n_kv = kv_self.n;
 
@@ -11948,8 +12254,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = cparams.
-    const bool has_embd =
+    const bool has_logits = !cparams.embeddings;
+    const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
     const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
@@ -12017,6 +12323,11 @@ static void llama_graph_compute(
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
+#ifdef GGML_USE_BLAS
+    if (lctx.backend_blas != nullptr) {
+        ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+    }
+#endif
 
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
@@ -12074,11 +12385,13 @@ static int llama_decode_internal(
     std::vector<std::vector<llama_seq_id>> seq_id;
 
     // count outputs
-    if (
+    if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
+        n_outputs = n_tokens_all;
+    } else if (batch_all.logits) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs += batch_all.logits[i] != 0;
         }
-    } else if (lctx.logits_all
+    } else if (lctx.logits_all) {
         n_outputs = n_tokens_all;
     } else {
         // keep last output only
@@ -12209,47 +12522,19 @@ static int llama_decode_internal(
             // no output
             res = nullptr;
             embd = nullptr;
-        } else if (!hparams.causal_attn) {
-            res = nullptr; // do not extract logits for embedding models such as BERT
-
-            // token or sequence embeddings
-            embd = gf->nodes[gf->n_nodes - 1];
-
-            GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
         } else if (cparams.embeddings) {
-
-
-
-
-            if (i_embd < 0) { break; }
-            embd = gf->nodes[i_embd];
-            }
-            GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
-
-            // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
-            if (!cparams.causal_attn) {
-                res = nullptr; // do not extract logits when not needed
-                // skip computing logits
-                // TODO: is this safe?
-                gf->n_nodes = i_embd + 1;
+            res = nullptr; // do not extract logits for embedding case
+            embd = gf->nodes[gf->n_nodes - 1];
+            if (strcmp(embd->name, "result_embd_pooled") != 0) {
+                embd = gf->nodes[gf->n_nodes - 2];
             }
+            GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
         } else {
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-        // for big prompts, if BLAS is enabled, it is better to use only one thread
-        // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-        // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-        // with the BLAS calls. need a better solution
-        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-        // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-        if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-            n_threads = std::min(4, n_threads);
-        }
-
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
         llama_set_inputs(lctx, u_batch);
@@ -12312,11 +12597,10 @@ static int llama_decode_internal(
                     ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                 }
             } break;
-        case LLAMA_POOLING_TYPE_CLS:
         case LLAMA_POOLING_TYPE_MEAN:
+        case LLAMA_POOLING_TYPE_CLS:
+        case LLAMA_POOLING_TYPE_LAST:
             {
-                GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
-
                 // extract sequence embeddings
                 auto & embd_seq_out = lctx.embd_seq;
                 embd_seq_out.clear();
@@ -12930,107 +13214,142 @@ struct llm_bigram_bpe {
 };
 
 struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
-        [... old lines 12934-13014 not shown ...]
-                        // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-                        "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                    });
-                    break;
-                default:
-                    // default regex for BPE tokenization pre-processing
-                    word_collection = unicode_regex_split(text, {
-                        "[\\p{P}\\$\\+<=>\\^~\\|]+",
-                        "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                        "\\p{N}+",
-                        "[0-9][0-9][0-9]",
-                    });
-                    break;
-            }
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
+        GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
+        switch (vocab.type_pre) {
+            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DBRX:
+            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
+                regex_exprs = {
+                    // same as llama3
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                regex_exprs = {
+                    "[\r\n]",
+                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                    "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
+                    "\\s+$",
+                    "[一-龥ࠀ-一가-]+",
+                    "\\p{N}+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                regex_exprs = {
+                    "[\r\n]",
+                    "\\s?\\p{L}+",
+                    "\\s?\\p{P}+",
+                    "[一-龥ࠀ-一가-]+",
+                    "\\p{N}",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "[0-9][0-9][0-9]",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_MPT:
+                // TODO: MPT pre-tokenization regexes are unknown
+                // the following are close, but not exact. run the following:
+                // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
+                GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+                regex_exprs = {
+                    "\\s?\\p{L}+",
+                    "\\s?\\p{P}+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+            case LLAMA_VOCAB_PRE_TYPE_REFACT:
+            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+                regex_exprs = {
+                    "\\p{N}",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT2:
+            case LLAMA_VOCAB_PRE_TYPE_OLMO:
+                regex_exprs = {
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
+            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_PORO:
+                regex_exprs = {
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                };
                 break;
             default:
-
+                // default regex for BPE tokenization pre-processing
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "\\p{N}+",
+                    "[0-9][0-9][0-9]",
+                };
                 break;
         }
+    }
+
+    void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
+        output.push_back(token_id);
+    }
+
+    bool append_bos(std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_bos) {
+            GGML_ASSERT(vocab.special_bos_id != -1);
+            output.push_back(vocab.special_bos_id);
+            return true;
+        }
+        return false;
+    }
+
+    bool append_eos(std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_eos) {
+            GGML_ASSERT(vocab.special_eos_id != -1);
+            output.push_back(vocab.special_eos_id);
+            return true;
+        }
+        return false;
+    }
+
+    void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+        if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
+                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+    }
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        int final_prev_index = -1;
+
+        const auto word_collection = unicode_regex_split(text, regex_exprs);
 
         symbols_final.clear();
 
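The constructor above only selects regex_exprs per pre-tokenizer type; the split itself happens in tokenize() through unicode_regex_split. A rough sketch of that call, assuming the internal unicode.h helper is available and using the default GPT-2 style pattern from the fallback case (the example input and expected output are illustrative):

    // Illustration: how the selected regex_exprs are consumed before BPE merging.
    #include "unicode.h"   // internal llama.cpp helper that declares unicode_regex_split
    #include <string>
    #include <vector>

    std::vector<std::string> pretokenize_example() {
        const std::vector<std::string> regex_exprs = {
            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
        };
        // Expected split (roughly): "Hello", " world", "!"; each piece is BPE-merged on its own.
        return unicode_regex_split("Hello world!", regex_exprs);
    }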
@@ -13041,7 +13360,7 @@ struct llm_tokenizer_bpe {
             int index = 0;
             size_t offset = 0;
 
-            if (
+            if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
                 symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                 offset = word.size();
             }
@@ -13122,10 +13441,9 @@ struct llm_tokenizer_bpe {
                 for (auto j = str.begin(); j != str.end(); ++j) {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
-                    if (token_multibyte
-
+                    if (token_multibyte != vocab.token_to_id.end()) {
+                        output.push_back(token_multibyte->second);
                     }
-                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -13164,6 +13482,8 @@ private:
 
     const llama_vocab & vocab;
 
+    std::vector<std::string> regex_exprs;
+
     std::vector<llm_symbol> symbols;
     std::vector<llm_symbol> symbols_final;
 
@@ -13173,7 +13493,7 @@ private:
 struct llm_tokenizer_wpm {
     llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
 
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
         const auto & token_map = vocab.token_to_id;
 
         // normalize and split by whitespace
@@ -13182,7 +13502,7 @@ struct llm_tokenizer_wpm {
         // bos token prepended already
 
         // find the longest tokens that form the words
-        for (const std::string &word : words) {
+        for (const std::string & word : words) {
             // skip empty words
             if (word.size() == 0) {
                 continue;
@@ -13199,7 +13519,7 @@ struct llm_tokenizer_wpm {
             for (int i = 0; i < n; ++i) {
                 // loop through possible match length
                 bool match = false;
-                for (int j = n; j > i; j--) {
+                for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
                     auto it = token_map.find(word1.substr(i, j - i));
                     if (it != token_map.end()) {
                         output.push_back(it->second);
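The change above bounds the candidate length by vocab.max_token_len + 1 instead of scanning to the end of the word, so the greedy longest-match search stops probing substrings that cannot possibly be vocabulary tokens. A standalone sketch of the same idea (token_map and max_token_len are stand-ins for the vocab members used in the diff, not the library's API):

    // Sketch: greedy longest-match with a length cap, mirroring the WPM loop above.
    #include <algorithm>
    #include <string>
    #include <unordered_map>
    #include <vector>

    std::vector<int> wpm_greedy_match(const std::string & word,
                                      const std::unordered_map<std::string, int> & token_map,
                                      int max_token_len) {
        std::vector<int> out;
        const int n = (int) word.size();
        for (int i = 0; i < n; ) {
            int matched_len = 0;
            // never try candidates longer than the longest token in the vocab
            for (int j = std::min(n, i + max_token_len + 1); j > i; j--) {
                auto it = token_map.find(word.substr(i, j - i));
                if (it != token_map.end()) {
                    out.push_back(it->second);
                    matched_len = j - i;
                    break;
                }
            }
            if (matched_len == 0) {
                break; // no token covers this position; the real code marks the word as unknown
            }
            i += matched_len;
        }
        return out;
    }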
@@ -13222,11 +13542,12 @@ struct llm_tokenizer_wpm {
         }
     }
 
-
+    // TODO: reduce string copies by using cpts_offs array
+    std::vector<std::string> preprocess(const std::string & text) const {
         const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
         std::vector<std::string> words(1, "");
 
-        for (const
+        for (const uint32_t cpt : cpts_nfd) {
             const auto flags = unicode_cpt_flags(cpt);
 
             if (flags.is_whitespace) {
@@ -13444,7 +13765,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 
                 bool is_prev_special = false;
 
-                if (add_special && vocab.
+                if (add_special && vocab.tokenizer_add_bos) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                     is_prev_special = true;
@@ -13454,7 +13775,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
-                        if (vocab.
+                        if (vocab.tokenizer_add_space_prefix) {
                             if (!output.size() || is_prev_special) { // prefix with space if first token
                                 raw_text = " " + raw_text;
                             }
@@ -13472,23 +13793,24 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }
 
-                if (add_special && vocab.
+                if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
                     LLAMA_LOG_WARN(
                         "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                         "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                         "Are you sure this is what you want?\n", __FUNCTION__);
                 }
 
-                if (add_special && vocab.
+                if (add_special && vocab.tokenizer_add_eos) {
                     GGML_ASSERT(vocab.special_eos_id != -1);
                     output.push_back(vocab.special_eos_id);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-
-
-
+                llm_tokenizer_bpe tokenizer(vocab);
+
+                if (add_special) {
+                    tokenizer.append_bos(output);
                 }
 
                 for (const auto & fragment : fragment_buffer) {
@@ -13498,23 +13820,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        llm_tokenizer_bpe tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-
+                        tokenizer.append(fragment.token, output);
                     }
                 }
 
-                if (add_special
-
-
-                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
-                        "Are you sure this is what you want?\n", __FUNCTION__);
-                }
-
-                if (add_special && vocab.special_add_eos == 1) {
-                    GGML_ASSERT(vocab.special_add_eos != -1);
-                    output.push_back(vocab.special_eos_id);
+                if (add_special) {
+                    tokenizer.append_eos(output);
+                    tokenizer.check_double_bos_eos(output);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
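With the hunks above, BOS/EOS insertion for BPE vocabularies goes through the tokenizer's append_bos()/append_eos() helpers, driven by the add_special flag, and check_double_bos_eos() only warns about duplicates. A minimal sketch of triggering that path from the public C API (the buffer sizing and the two boolean flags are illustrative choices):

    // Sketch: tokenize with special-token handling enabled.
    #include "llama.h"
    #include <algorithm>
    #include <string>
    #include <vector>

    std::vector<llama_token> tokenize_prompt(const llama_model * model, const std::string & text) {
        // worst case: roughly one token per byte, plus room for BOS/EOS
        std::vector<llama_token> tokens(text.size() + 2);
        const int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                                     tokens.data(), (int) tokens.size(),
                                     /*add_special=*/true, /*parse_special=*/false);
        tokens.resize(std::max(n, 0));
        return tokens;
    }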
@@ -13524,6 +13838,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     output.push_back(vocab.special_cls_id);
                 }
 
+                llm_tokenizer_wpm tokenizer(vocab);
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -13531,7 +13847,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        llm_tokenizer_wpm tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
@@ -13631,7 +13946,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
         const uint32_t chr) {
 
     bool found = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
 
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
 
@@ -13640,6 +13955,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
             // inclusive range, e.g. [a-z]
             found = found || (pos->value <= chr && chr <= pos[1].value);
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            found = true;
+            pos += 1;
         } else {
             // exact char match, e.g. [a] or "a"
             found = found || pos->value == chr;
@@ -13657,7 +13976,7 @@ static bool llama_grammar_match_partial_char(
         const llama_grammar_element * pos,
         const llama_partial_utf8 partial_utf8) {
 
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
 
     uint32_t partial_value = partial_utf8.value;
@@ -13687,6 +14006,9 @@ static bool llama_grammar_match_partial_char(
             return is_positive_char;
         }
         pos += 2;
+    } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+        // Any character matches "."
+        return true;
     } else {
         // exact char match, e.g. [a] or "a"
         if (low <= pos->value && pos->value <= high) {
@@ -13747,6 +14069,7 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
+        case LLAMA_GRETYPE_CHAR_ANY:
            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
                 new_stacks.emplace_back(stack);
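Taken together, the grammar hunks above add LLAMA_GRETYPE_CHAR_ANY so that a "." element matches any single character during full matching, partial UTF-8 matching and stack advancement. A small, hypothetical GBNF snippet that would rely on the new element (not taken from this diff):

    // Hypothetical grammar: a fixed "ID-" prefix followed by exactly four arbitrary characters.
    static const char * k_example_grammar = R"(
    root ::= "ID-" . . . .
    )";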
@@ -15220,6 +15543,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     if (imatrix_data) {
         LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
         qs.has_imatrix = true;
+        // check imatrix for nans or infs
+        for (const auto & kv : *imatrix_data) {
+            for (float f : kv.second) {
+                if (!std::isfinite(f)) {
+                    throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+                }
+            }
+        }
     }
 }
 
@@ -16024,6 +16355,11 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }
 
+    if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
+        LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
@@ -16195,8 +16531,7 @@ struct llama_context * llama_new_context_with_model(
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
             if (backend == nullptr) {
-
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
@@ -16226,6 +16561,16 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #endif
+
+#ifdef GGML_USE_BLAS
+        ctx->backend_blas = ggml_backend_blas_init();
+        if (ctx->backend_blas == nullptr) {
+            LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+        } else {
+            ctx->backends.push_back(ctx->backend_blas);
+        }
+#endif
+
 #if defined(GGML_USE_RPC)
         if (model->n_gpu_layers > 0) {
             for (const auto & endpoint : model->rpc_servers) {
@@ -17814,6 +18159,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
     ctx->abort_callback_data = abort_callback_data;
 }
 
+void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+    ctx->cparams.embeddings = embeddings;
+}
+
 void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
     ctx->cparams.causal_attn = causal_attn;
 }
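The new llama_set_embeddings() setter lets one context be flipped between text generation and embedding extraction without rebuilding it. A minimal sketch, assuming ctx and batch were prepared elsewhere and that the loaded model supports pooled embeddings:

    // Sketch: reuse one context for an embedding pass via the new setter.
    #include "llama.h"

    void run_embedding_pass(llama_context * ctx, llama_batch & batch) {
        llama_set_embeddings(ctx, true);     // new API added in the hunk above
        llama_set_causal_attn(ctx, false);   // typical for pooled embedding extraction
        llama_decode(ctx, batch);

        const float * emb = llama_get_embeddings_seq(ctx, 0);
        (void) emb; // consume the pooled embedding for sequence 0

        // switch back to normal generation afterwards
        llama_set_causal_attn(ctx, true);
        llama_set_embeddings(ctx, false);
    }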
@@ -18057,11 +18406,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
 }
 
 int32_t llama_add_bos_token(const struct llama_model * model) {
-    return model->vocab.
+    return model->vocab.tokenizer_add_bos;
 }
 
 int32_t llama_add_eos_token(const struct llama_model * model) {
-    return model->vocab.
+    return model->vocab.tokenizer_add_eos;
 }
 
 llama_token llama_token_prefix(const struct llama_model * model) {