llama_cpp 0.16.0 → 0.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +3 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +119 -54
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
- data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
- data/vendor/tmp/llama.cpp/ggml.c +158 -414
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +628 -279
- data/vendor/tmp/llama.cpp/llama.h +9 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +15 -3
data/vendor/tmp/llama.cpp/llama.cpp +628 -279

@@ -21,6 +21,10 @@
 # include "ggml-kompute.h"
 #endif
 
+#ifdef GGML_USE_BLAS
+# include "ggml-blas.h"
+#endif
+
 #ifdef GGML_USE_METAL
 # include "ggml-metal.h"
 #endif
@@ -282,6 +286,7 @@ enum llm_kv {
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
@@ -360,21 +365,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL,             "general.source.url"                    },
     { LLM_KV_GENERAL_SOURCE_HF_REPO,         "general.source.huggingface.repository" },
 
-    { LLM_KV_VOCAB_SIZE,                    "%s.vocab_size"                 },
-    { LLM_KV_CONTEXT_LENGTH,                "%s.context_length"             },
-    { LLM_KV_EMBEDDING_LENGTH,              "%s.embedding_length"           },
-    { LLM_KV_BLOCK_COUNT,                   "%s.block_count"                },
-    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,     "%s.leading_dense_block_count"  },
-    { LLM_KV_FEED_FORWARD_LENGTH,           "%s.feed_forward_length"        },
-    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,    "%s.expert_feed_forward_length" },
-    { LLM_KV_USE_PARALLEL_RESIDUAL,         "%s.use_parallel_residual"      },
-    { LLM_KV_TENSOR_DATA_LAYOUT,            "%s.tensor_data_layout"         },
-    { LLM_KV_EXPERT_COUNT,                  "%s.expert_count"               },
-    { LLM_KV_EXPERT_USED_COUNT,             "%s.expert_used_count"          },
-    { LLM_KV_EXPERT_SHARED_COUNT,           "%s.expert_shared_count"        },
-    { LLM_KV_EXPERT_WEIGHTS_SCALE,          "%s.expert_weights_scale"       },
-    { LLM_KV_POOLING_TYPE ,                 "%s.pooling_type"               },
-    { LLM_KV_LOGIT_SCALE,                   "%s.logit_scale"                },
+    { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size"                        },
+    { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length"                    },
+    { LLM_KV_EMBEDDING_LENGTH,                  "%s.embedding_length"                  },
+    { LLM_KV_BLOCK_COUNT,                       "%s.block_count"                       },
+    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,         "%s.leading_dense_block_count"         },
+    { LLM_KV_FEED_FORWARD_LENGTH,               "%s.feed_forward_length"               },
+    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        "%s.expert_feed_forward_length"        },
+    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+    { LLM_KV_USE_PARALLEL_RESIDUAL,             "%s.use_parallel_residual"             },
+    { LLM_KV_TENSOR_DATA_LAYOUT,                "%s.tensor_data_layout"                },
+    { LLM_KV_EXPERT_COUNT,                      "%s.expert_count"                      },
+    { LLM_KV_EXPERT_USED_COUNT,                 "%s.expert_used_count"                 },
+    { LLM_KV_EXPERT_SHARED_COUNT,               "%s.expert_shared_count"               },
+    { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"              },
+    { LLM_KV_POOLING_TYPE ,                     "%s.pooling_type"                      },
+    { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -704,6 +710,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
     { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
     { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+    { LLM_TENSOR_ATTN_NORM_2,     "blk.%d.attn_norm_2" },
     { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
     { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
     { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
@@ -1273,6 +1280,126 @@ struct no_init {
 };
 
 struct llama_file {
+
+#if defined(_WIN32)
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    HANDLE fp_win32;
+    size_t size;
+
+private:
+    std::string GetErrorMessageWin32(DWORD error_code) const {
+        std::string ret;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                      NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (!bufLen) {
+            ret = format("Win32 error code: %s", error_code);
+        } else {
+            ret = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+        }
+
+        return ret;
+    }
+
+public:
+
+    llama_file(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+        // SetFilePointerEx returns the current position when seeking relative 0 bytes
+        LARGE_INTEGER li;
+        li.QuadPart = 0;
+        BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+
+        return li.QuadPart;
+    }
+
+    void seek(size_t offset, int whence) const {
+        // no need to convert SEEK_* to FILE_*. The enums are the same.
+        // Still, keep static asserts to avoid failures in the future.
+        static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+        static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+        static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+        LARGE_INTEGER li;
+        li.QuadPart = offset;
+        BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
+        // use the Win32 API to do file io instead of the C/C++ library functions.
+
+        // There are conditions under which ReadFile cannot read chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+            DWORD chunk_read = 0;
+            BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+            if (!result) {
+                throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_read < chunk_size || chunk_read == 0) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+
+            bytes_read += chunk_read;
+        } ;
+    }
+
+    uint32_t read_u32() const {
+        uint32_t val;
+        read_raw(&val, sizeof(val));
+        return val;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        // There are conditions under which WriteFile cannot write chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_written = 0;
+        while (bytes_written < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+            DWORD chunk_written = 0;
+            BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+            if (!result) {
+                throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_written < chunk_size || chunk_written == 0) {
+                throw std::runtime_error("unexpectedly failed to write bytes");
+            }
+
+            bytes_written += chunk_written;
+        }
+    }
+
+    void write_u32(std::uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#else
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
@@ -1293,7 +1420,10 @@ struct llama_file {
 #else
         long ret = std::ftell(fp);
 #endif
-
+        if (ret == -1) {
+            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+        }
+
         return (size_t) ret;
     }
 
@@ -1303,7 +1433,9 @@ struct llama_file {
 #else
         int ret = std::fseek(fp, (long) offset, whence);
 #endif
-
+        if (ret != 0) {
+            throw std::runtime_error(format("seek error: %s", strerror(errno)));
+        }
     }
 
     void read_raw(void * ptr, size_t len) const {
@@ -1346,6 +1478,7 @@ struct llama_file {
             std::fclose(fp);
         }
     }
+#endif
 };
 using llama_files = std::vector<std::unique_ptr<llama_file>>;
 
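Note: the new Win32 branch of llama_file above deliberately splits large reads and writes into chunks of at most 64 MiB, because ReadFile/WriteFile can fail on very large single requests. A minimal standalone sketch of the same chunked-read pattern follows; the function name and error handling are illustrative, not part of the package:

    #include <windows.h>

    #include <algorithm>
    #include <cstddef>
    #include <stdexcept>

    // Read len bytes from h in chunks of at most 64 MiB, since very large
    // single ReadFile requests can fail or be truncated.
    static void read_all(HANDLE h, void * dst, size_t len) {
        size_t done = 0;
        while (done < len) {
            const DWORD chunk = (DWORD) std::min<size_t>(len - done, 64u*1024*1024);
            DWORD got = 0;
            if (!ReadFile(h, (char *) dst + done, chunk, &got, NULL) || got == 0) {
                throw std::runtime_error("read failed or reached end of file");
            }
            done += got; // ReadFile may return fewer bytes than requested
        }
    }

Unlike the vendored code, which treats a short read as an error, this sketch simply resumes from wherever ReadFile stopped; both are valid ways to drain the remaining bytes.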
@@ -1839,6 +1972,7 @@ struct llama_hparams {
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
+    uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float    expert_weights_scale = 0.0;
 
@@ -1887,6 +2021,7 @@ struct llama_hparams {
         if (this->n_lora_q  != other.n_lora_q)  return true;
         if (this->n_lora_kv != other.n_lora_kv) return true;
         if (this->n_ff_exp  != other.n_ff_exp)  return true;
+        if (this->n_ff_shexp != other.n_ff_shexp) return true;
         if (this->n_expert_shared != other.n_expert_shared) return true;
 
         if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -2158,6 +2293,8 @@ struct llama_vocab {
     enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
+    int max_token_len = 0; // used for optimizing longest token search
+
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
 
@@ -2175,16 +2312,17 @@ struct llama_vocab {
     id special_cls_id  = -1;
     id special_mask_id = -1;
 
-    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
-    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
-
     id linefeed_id       = 13;
     id special_prefix_id = -1;
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
 
-
+    // tokenizer flags
+    bool tokenizer_add_space_prefix = true;
+    bool tokenizer_add_bos = false;
+    bool tokenizer_add_eos = false;
+    bool tokenizer_ignore_merges = false;
 
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
         GGML_ASSERT(token_left.find(' ') == std::string::npos);
@@ -2298,9 +2436,13 @@ struct llama_context {
     std::vector<ggml_backend_t> backends;
 #ifdef GGML_USE_METAL
     ggml_backend_t backend_metal = nullptr;
+#endif
+#ifdef GGML_USE_BLAS
+    ggml_backend_t backend_blas = nullptr;
 #endif
     ggml_backend_t backend_cpu = nullptr;
+
     const llama_model & model;
 
     // key + value cache for the self attention
@@ -3712,6 +3854,44 @@ struct llama_model_loader {
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
 
+#if defined(GGML_USE_CUDA)
+        // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
+        // NVMe raid configurations might require more / larger buffers.
+        constexpr size_t num_buffers = 4;
+        constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+        std::vector<ggml_backend_buffer_t> host_buffers;
+        std::vector<void*> host_ptrs;
+        std::vector<ggml_backend_event_t> events;
+        size_t buffer_idx = 0; // buffer to use for async loads
+
+        ggml_backend_t cuda_backend = nullptr;
+        if (!use_mmap && !check_tensors) {
+            // When not using mmaped io use async uploads from pinned memory to GPU memory.
+            // First determine if the CUDA backend is active, and if so, determine the device ID.
+            ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
+            if (buf) {
+                ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
+                for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+                    auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
+                    if (buffer_type == cuda_buffer_type) {
+                        cuda_backend = ggml_backend_cuda_init(i);
+                        break;
+                    }
+                }
+            }
+
+            // If the cuda backend is active create pinned memory buffers and events for synchronisation.
+            if (cuda_backend) {
+                for (size_t idx = 0; idx < num_buffers; ++idx) {
+                    host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
+                    host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+                    events.emplace_back(ggml_backend_event_new(cuda_backend));
+                }
+            }
+        }
+#endif
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3767,12 +3947,36 @@ struct llama_model_loader {
                     }));
                 }
             } else {
-                read_buf.resize(n_size);
-                file->seek(weight->offs, SEEK_SET);
-                file->read_raw(read_buf.data(), n_size);
-                ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
-                if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
-                    throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+#if defined(GGML_USE_CUDA)
+                // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+                if (cuda_backend) {
+                    file->seek(weight->offs, SEEK_SET);
+
+                    size_t bytes_read = 0;
+
+                    while (bytes_read < n_size) {
+                        size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+                        ggml_backend_event_synchronize(events[buffer_idx]);
+                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                        ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                        ggml_backend_event_record(events[buffer_idx]);
+
+                        bytes_read += read_iteration;
+                        ++buffer_idx;
+                        buffer_idx %= num_buffers;
+                    }
+                }
+                else
+#endif
+                {
+                    read_buf.resize(n_size);
+                    file->seek(weight->offs, SEEK_SET);
+                    file->read_raw(read_buf.data(), n_size);
+                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                    }
                 }
             }
         }
@@ -3780,6 +3984,18 @@ struct llama_model_loader {
             size_done += n_size;
         }
 
+#if defined(GGML_USE_CUDA)
+        // free temporary resources used for async cuda uploads
+        if (cuda_backend) {
+            for (size_t idx = 0; idx < num_buffers;++idx) {
+                ggml_backend_event_synchronize(events[idx]);
+                ggml_backend_event_free(events[idx]);
+                ggml_backend_buffer_free(host_buffers[idx]);
+            }
+            ggml_backend_free(cuda_backend);
+        }
+#endif
+
         // check validation results
         bool validation_failed = false;
         for (auto & future : validation_result) {
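Note: when mmap is off and tensor validation is off, the loader above streams each tensor through a small ring of pinned host buffers, overlapping disk reads with host-to-device copies; the event recorded per buffer guarantees a buffer is not refilled while its previous upload is still in flight. A rough equivalent of that pipeline, written against the plain CUDA runtime API rather than the ggml backend API (all names here are illustrative):

    #include <cuda_runtime.h>

    #include <algorithm>
    #include <cstddef>

    constexpr size_t kNumBufs = 4;
    constexpr size_t kBufSize = 1 << 20; // 1 MiB, matching the hunk's defaults

    // file_read is a stand-in for llama_file::read_raw.
    void upload_async(void * dev_dst, size_t n_size,
                      size_t (*file_read)(void * dst, size_t len)) {
        void *      host[kNumBufs];
        cudaEvent_t done[kNumBufs];
        for (size_t i = 0; i < kNumBufs; ++i) {
            cudaHostAlloc(&host[i], kBufSize, cudaHostAllocDefault); // pinned memory
            cudaEventCreate(&done[i]);
        }

        size_t off = 0, idx = 0;
        while (off < n_size) {
            const size_t n = std::min(kBufSize, n_size - off);
            cudaEventSynchronize(done[idx]);         // wait until this buffer is free
            file_read(host[idx], n);                 // refill it from disk
            cudaMemcpyAsync((char *) dev_dst + off, host[idx], n,
                            cudaMemcpyHostToDevice); // queue the upload
            cudaEventRecord(done[idx]);              // mark when the copy finishes
            off += n;
            idx  = (idx + 1) % kNumBufs;
        }

        for (size_t i = 0; i < kNumBufs; ++i) {
            cudaEventSynchronize(done[i]);
            cudaEventDestroy(done[i]);
            cudaFreeHost(host[i]);
        }
    }

With four 1 MiB buffers, the disk read of chunk k+1 proceeds while chunk k is still being copied to the GPU, which is what hides the storage latency.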
@@ -4246,6 +4462,9 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_QWEN2MOE:
             {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_A2_7B; break;
@@ -4552,38 +4771,9 @@ static void llm_load_vocab(
         vocab.special_cls_id  = -1;
         vocab.special_mask_id = -1;
 
-        // For Fill-In-the-Middle (FIM)/infill models which where converted
-        // prior to support of FIM special tokens in GGUF, the following
-        // will allow those models to continue to work. The general names
-        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
-        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
-        // new versions of these models have been published.
-        std::string gen_name;
-        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
-        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
-            [](unsigned char c){ return std::tolower(c); });
-
-        if (gen_name.find("code") != std::string::npos) {
-            if (model.arch == LLM_ARCH_LLAMA) {
-                vocab.special_prefix_id = 32007;
-                vocab.special_suffix_id = 32008;
-                vocab.special_middle_id = 32009;
-                vocab.special_eot_id    = 32010;
-            } else if (model.arch == LLM_ARCH_GEMMA) {
-                vocab.special_prefix_id = 67;
-                vocab.special_suffix_id = 69;
-                vocab.special_middle_id = 68;
-                // TODO: this is not EOT, it is "file separator" token, needs fix
-                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
-                //vocab.special_eot_id    = 70;
-                vocab.special_eot_id    = 107;
-            }
-        }
-
         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {
-            vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
         } // The default value of add_space_prefix is true.
     } else if (tokenizer_model == "bert") {
         vocab.type = LLAMA_VOCAB_TYPE_WPM;
@@ -4596,13 +4786,13 @@ static void llm_load_vocab(
         vocab.special_pad_id  = 0;
         vocab.special_cls_id  = 101;
         vocab.special_mask_id = 103;
-        vocab.add_space_prefix = false;
+        vocab.tokenizer_add_space_prefix = false;
     } else if (tokenizer_model == "gpt2") {
         vocab.type = LLAMA_VOCAB_TYPE_BPE;
 
         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {
-            vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
         }
 
         // read bpe merges and populate bpe ranks
@@ -4653,14 +4843,15 @@ static void llm_load_vocab(
             LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
             LLAMA_LOG_WARN("%s:                                             \n", __func__);
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (
-                tokenizer_pre == "default") {
+        } else if (tokenizer_pre == "default") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         } else if (
                 tokenizer_pre == "llama3"   ||
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+            vocab.tokenizer_ignore_merges = true;
+            vocab.tokenizer_add_bos = true;
         } else if (
                 tokenizer_pre == "deepseek-llm") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
@@ -4681,7 +4872,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de") {
+                tokenizer_pre == "jina-v2-de" ||
+                tokenizer_pre == "jina-v2-code") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -4704,9 +4896,20 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "smaug-bpe") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+        } else if (
+                tokenizer_pre == "poro-chat") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
+    } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        vocab.tokenizer_add_bos = true;
+        vocab.tokenizer_add_eos = false;
+    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
+        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        vocab.tokenizer_add_bos = true;
+        vocab.tokenizer_add_eos = false;
     } else {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
     }
@@ -4738,6 +4941,7 @@ static void llm_load_vocab(
         GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
 
         vocab.token_to_id[word] = i;
+        vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
 
         auto & token_data = vocab.id_to_token[i];
         token_data.text = std::move(word);
@@ -4761,6 +4965,45 @@ static void llm_load_vocab(
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        // For Fill-In-the-Middle (FIM)/infill models which where converted
+        // prior to support of FIM special tokens in GGUF, the following
+        // will allow those models to continue to work. The general names
+        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+        // new versions of these models have been published.
+        std::string gen_name;
+        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+            [](unsigned char c){ return std::tolower(c); });
+
+        if (gen_name.find("code") != std::string::npos) {
+            if (model.arch == LLM_ARCH_LLAMA
+                  && 32010 < vocab.id_to_token.size()
+                  && vocab.id_to_token[32007].text == "<PRE>"
+                  && vocab.id_to_token[32008].text == "<SUF>"
+                  && vocab.id_to_token[32009].text == "<MID>"
+                  && vocab.id_to_token[32010].text == "<EOT>") {
+                vocab.special_prefix_id = 32007;
+                vocab.special_suffix_id = 32008;
+                vocab.special_middle_id = 32009;
+                vocab.special_eot_id    = 32010;
+            } else if (model.arch == LLM_ARCH_GEMMA
+                  && 107 < vocab.id_to_token.size()
+                  && vocab.id_to_token[67].text == "<|fim_prefix|>"
+                  && vocab.id_to_token[69].text == "<|fim_suffix|>"
+                  && vocab.id_to_token[68].text == "<|fim_middle|>"
+                  && vocab.id_to_token[107].text == "<end_of_turn>") {
+                vocab.special_prefix_id = 67;
+                vocab.special_suffix_id = 69;
+                vocab.special_middle_id = 68;
+                // TODO: this is not EOT, it is "file separator" token, needs fix
+                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+                //vocab.special_eot_id    = 70;
+                vocab.special_eot_id    = 107;
+            }
+        }
+
         try {
             vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
         } catch (const std::exception & e) {
@@ -4812,10 +5055,10 @@ static void llm_load_vocab(
         bool temp = true;
 
         if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
-            vocab.special_add_bos = int(temp);
+            vocab.tokenizer_add_bos = temp;
         }
         if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
-            vocab.special_add_eos = int(temp);
+            vocab.tokenizer_add_eos = temp;
        }
    }
 
@@ -4915,7 +5158,7 @@ static void llm_load_vocab(
     );
 
     // set attributes by model/tokenizer name
-    if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
+    if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
         _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
     } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
         for (auto id : vocab.cache_special_tokens) {
@@ -5009,6 +5252,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token        = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
     if (vocab.special_eot_id    != -1) { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,    vocab.id_to_token[vocab.special_eot_id].text.c_str()    ); }
 
+    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
+
     if (model.arch == LLM_ARCH_DEEPSEEK2) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q           = %d\n", __func__, hparams.n_lora_q);
@@ -5018,6 +5263,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
+
+    if (model.arch == LLM_ARCH_QWEN2MOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp         = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp       = %d\n", __func__, hparams.n_ff_shexp);
+    }
 }
 
 // Returns false if cancelled by progress_callback
@@ -5161,7 +5411,7 @@ static bool llm_load_tensors(
     // create tensors for the weights
     {
         const int64_t n_embd       = hparams.n_embd;
-        const int64_t n_embd_head  = n_embd / hparams.n_head;
+        const int64_t n_embd_head  = (hparams.n_head == 0) ? 0 : n_embd / hparams.n_head;
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
         const int64_t n_embd_gqa   = n_embd_v_gqa;
@@ -5515,7 +5765,7 @@ static bool llm_load_tensors(
 
                         layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
                     } else {
-                        layer.ffn_gate …
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                     }
 
                     layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5556,6 +5806,9 @@ static bool llm_load_tensors(
                     layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
                     layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd});
 
+                    layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
                     layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 
@@ -5801,16 +6054,17 @@ static bool llm_load_tensors(
                         GGML_ASSERT(hparams.n_expert_used > 0);
 
                         // MoE branch
-                        auto n_ff_exp = n_ff / hparams.n_expert_used;
+                        auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
                         layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
                         layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
                         layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});
 
                         // Shared expert branch
+                        auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
                         layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
-                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
-                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff,   n_embd});
-                        layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff});
+                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
+                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
+                        layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp});
                     }
                 } break;
             case LLM_ARCH_PHI2:
@@ -6600,16 +6854,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
     }
 #endif
 
-#ifdef GGML_USE_SYCL
-    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
-        ggml_backend_sycl_set_single_device_mode(params.main_gpu);
-        //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
-        params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
-    } else {
-        ggml_backend_sycl_set_mul_device_mode();
-    }
-#endif
-
     if (!llm_load_tensors(
         ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
         params.progress_callback, params.progress_callback_user_data
@@ -7410,6 +7654,50 @@ struct llm_build_context {
         return lctx.inp_s_seq;
     }
 
+    struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
+        // find result_norm tensor for input
+        struct ggml_tensor * inp = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            inp = gf->nodes[i];
+            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+                break;
+            } else {
+                inp = nullptr;
+            }
+        }
+        GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+        struct ggml_tensor * cur;
+
+        switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    struct ggml_tensor * inp_mean = build_inp_mean();
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+                } break;
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
+                {
+                    struct ggml_tensor * inp_cls = build_inp_cls();
+                    cur = ggml_get_rows(ctx0, inp, inp_cls);
+                } break;
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    cur = inp;
+                } break;
+            default:
+                {
+                    GGML_ASSERT(false && "unknown pooling type");
+                } break;
+        }
+
+        cb(cur, "result_embd_pooled", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
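Note: append_pooling above factors pooling out of the individual model graphs. Mean pooling is expressed as a matrix product of the transposed token embeddings with inp_mean, whose column for sequence s holds 1/len(s) at that sequence's token rows; CLS and LAST pooling reduce to a row gather (ggml_get_rows) with the row index supplied via inp_cls. A small CPU sketch of the mean case, on plain arrays with illustrative names:

    #include <cstddef>
    #include <vector>

    // embd holds n_tokens x n_embd token embeddings, row-major;
    // seq_of_token[t] is the sequence id of batch row t.
    // Returns n_seqs x n_embd pooled embeddings: the per-sequence mean,
    // i.e. the same result as transpose(embd) times the averaging matrix.
    std::vector<float> pool_mean(const std::vector<float> & embd,
                                 const std::vector<int>   & seq_of_token,
                                 size_t n_tokens, size_t n_embd, size_t n_seqs) {
        std::vector<float>  out(n_seqs * n_embd, 0.0f);
        std::vector<size_t> len(n_seqs, 0);
        for (size_t t = 0; t < n_tokens; ++t) {
            len[seq_of_token[t]]++;
        }
        for (size_t t = 0; t < n_tokens; ++t) {
            const size_t s = seq_of_token[t];
            for (size_t e = 0; e < n_embd; ++e) {
                out[s*n_embd + e] += embd[t*n_embd + e] / (float) len[s];
            }
        }
        return out;
    }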
@@ -8390,8 +8678,6 @@ struct llm_build_context {
         if (model.arch != LLM_ARCH_JINA_BERT_V2) {
             inp_pos = build_inp_pos();
         }
-        struct ggml_tensor * inp_mean = build_inp_mean();
-        struct ggml_tensor * inp_cls  = build_inp_cls();
 
         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8519,6 +8805,11 @@ struct llm_build_context {
             // attention layer norm
             cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
 
+            if (model.layers[il].attn_norm_2 != nullptr) {
+                cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+                cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+            }
+
             struct ggml_tensor * ffn_inp = cur;
             cb(ffn_inp, "ffn_inp", il);
 
@@ -8561,28 +8852,6 @@ struct llm_build_context {
         cur = inpL;
         cb(cur, "result_embd", -1);
 
-        // pooling layer
-        switch (pooling_type) {
-            case LLAMA_POOLING_TYPE_NONE:
-                {
-                    // nop
-                } break;
-            case LLAMA_POOLING_TYPE_MEAN:
-                {
-                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_CLS:
-                {
-                    cur = ggml_get_rows(ctx0, cur, inp_cls);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_UNSPECIFIED:
-                {
-                    GGML_ASSERT(false && "Invalid pooling type");
-                } break;
-        }
-
         ggml_build_forward_expand(gf, cur);
 
         return gf;
@@ -11520,7 +11789,8 @@ static struct ggml_cgraph * llama_build_graph(
         if (batch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 for (auto * backend : lctx.backends) {
-                    if (…
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
                         ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
                         break;
                     }
@@ -11666,6 +11936,11 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
+    // add on pooling layer
+    if (lctx.cparams.embeddings) {
+        result = llm.append_pooling(result);
+    }
+
     llm.free();
 
     return result;
@@ -11755,7 +12030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
             // (!a || b) is a logical implication (a -> b)
             // !hparams.causal_attn -> !cparams.causal_attn
             (hparams.causal_attn || !cparams.causal_attn) &&
-            "causal attention …
+            "causal attention is not supported by this model"
         );
 
     if (lctx.inp_KQ_mask) {
@@ -11887,6 +12162,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(lctx.inp_cls);
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
+        std::vector<int> last_pos(n_tokens, -1);
+        std::vector<int> last_row(n_tokens, -1);
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            const llama_pos    pos    = batch.pos[i];
+
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+
+            if (pos >= last_pos[seq_id]) {
+                last_pos[seq_id] = pos;
+                last_row[seq_id] = i;
+            }
+        }
+
+        for (int i = 0; i < n_tokens; ++i) {
+            if (last_row[i] >= 0) {
+                data[i] = last_row[i];
+            }
+        }
+    }
+
     if (kv_self.recurrent) {
         const int64_t n_kv = kv_self.n;
 
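Note: for LLAMA_POOLING_TYPE_LAST the inp_cls tensor above is repurposed to carry, for each sequence, the batch row of its highest-position token, so the graph-side ggml_get_rows picks the last token's embedding. The same selection in isolation, with illustrative names:

    #include <vector>

    // pos[i] / seq[i] describe batch row i. Returns, per sequence id, the row
    // holding that sequence's highest-position token (-1 if the sequence is
    // absent from the batch).
    std::vector<int> last_token_rows(const std::vector<int> & pos,
                                     const std::vector<int> & seq, int n_seqs) {
        std::vector<int> last_pos(n_seqs, -1);
        std::vector<int> last_row(n_seqs, -1);
        for (int i = 0; i < (int) pos.size(); ++i) {
            if (pos[i] >= last_pos[seq[i]]) {
                last_pos[seq[i]] = pos[i];
                last_row[seq[i]] = i;
            }
        }
        return last_row;
    }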
@@ -11948,8 +12254,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
|
|
|
11948
12254
|
const auto n_embd = hparams.n_embd;
|
|
11949
12255
|
|
|
11950
12256
|
// TODO: use a per-batch flag for logits presence instead
|
|
11951
|
-
const bool has_logits = cparams.
|
|
11952
|
-
const bool has_embd =
|
|
12257
|
+
const bool has_logits = !cparams.embeddings;
|
|
12258
|
+
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
|
|
11953
12259
|
|
|
11954
12260
|
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
|
|
11955
12261
|
const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
|
|
@@ -12017,6 +12323,11 @@ static void llama_graph_compute(
|
|
|
12017
12323
|
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
|
12018
12324
|
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
|
12019
12325
|
}
|
|
12326
|
+
#ifdef GGML_USE_BLAS
|
|
12327
|
+
if (lctx.backend_blas != nullptr) {
|
|
12328
|
+
ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
|
|
12329
|
+
}
|
|
12330
|
+
#endif
|
|
12020
12331
|
|
|
12021
12332
|
ggml_backend_sched_graph_compute_async(lctx.sched, gf);
|
|
12022
12333
|
|
|
@@ -12074,11 +12385,13 @@ static int llama_decode_internal(
|
|
|
12074
12385
|
std::vector<std::vector<llama_seq_id>> seq_id;
|
|
12075
12386
|
|
|
12076
12387
|
// count outputs
|
|
12077
|
-
if (
|
|
12388
|
+
if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
|
|
12389
|
+
n_outputs = n_tokens_all;
|
|
12390
|
+
} else if (batch_all.logits) {
|
|
12078
12391
|
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
|
12079
12392
|
n_outputs += batch_all.logits[i] != 0;
|
|
12080
12393
|
}
|
|
12081
|
-
} else if (lctx.logits_all
|
|
12394
|
+
} else if (lctx.logits_all) {
|
|
12082
12395
|
n_outputs = n_tokens_all;
|
|
12083
12396
|
} else {
|
|
12084
12397
|
// keep last output only
|
|
@@ -12209,47 +12522,19 @@ static int llama_decode_internal(
             // no output
             res  = nullptr;
             embd = nullptr;
-        } else if (!hparams.causal_attn) {
-            res = nullptr; // do not extract logits for embedding models such as BERT
-
-            // token or sequence embeddings
-            embd = gf->nodes[gf->n_nodes - 1];
-
-            GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
         } else if (cparams.embeddings) {
-            // the embeddings could be in the second to last tensor, or any of the previous tensors
-            int i_embd = gf->n_nodes - 2;
-            for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
-                i_embd = gf->n_nodes - i;
-                if (i_embd < 0) { break; }
-                embd = gf->nodes[i_embd];
-            }
-            GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
-
-            // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
-            if (!cparams.causal_attn) {
-                res = nullptr; // do not extract logits when not needed
-                // skip computing logits
-                // TODO: is this safe?
-                gf->n_nodes = i_embd + 1;
+            res = nullptr; // do not extract logits for embedding case
+            embd = gf->nodes[gf->n_nodes - 1];
+            if (strcmp(embd->name, "result_embd_pooled") != 0) {
+                embd = gf->nodes[gf->n_nodes - 2];
             }
+            GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
         } else {
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

-        // for big prompts, if BLAS is enabled, it is better to use only one thread
-        // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-        // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-        // with the BLAS calls. need a better solution
-        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-        // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-        if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-            n_threads = std::min(4, n_threads);
-        }
-
         ggml_backend_sched_alloc_graph(lctx.sched, gf);

         llama_set_inputs(lctx, u_batch);
@@ -12312,11 +12597,10 @@ static int llama_decode_internal(
                         ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                     }
                 } break;
-            case LLAMA_POOLING_TYPE_CLS:
             case LLAMA_POOLING_TYPE_MEAN:
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
                 {
-                    GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
-
                     // extract sequence embeddings
                     auto & embd_seq_out = lctx.embd_seq;
                     embd_seq_out.clear();
@@ -12930,107 +13214,142 @@ struct llm_bigram_bpe {
 };

 struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
-[... 81 removed lines lost in this rendering: the body of the old per-model pre-tokenizer regex switch inside tokenize(); only its tail follows ...]
-                    //  "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                });
-                break;
-            default:
-                // default regex for BPE tokenization pre-processing
-                word_collection = unicode_regex_split(text, {
-                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
-                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                    "\\p{N}+",
-                    "[0-9][0-9][0-9]",
-                });
-                break;
-        }
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
+        GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
+        switch (vocab.type_pre) {
+            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

+                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DBRX:
+            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
+                regex_exprs = {
+                    // same as llama3
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                regex_exprs = {
+                    "[\r\n]",
+                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                    "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
+                    "\\s+$",
+                    "[一-龥ࠀ-一가-]+",
+                    "\\p{N}+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                regex_exprs = {
+                    "[\r\n]",
+                    "\\s?\\p{L}+",
+                    "\\s?\\p{P}+",
+                    "[一-龥ࠀ-一가-]+",
+                    "\\p{N}",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "[0-9][0-9][0-9]",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_MPT:
+                // TODO: MPT pre-tokenization regexes are unknown
+                //       the following are close, but not exact. run the following:
+                //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
+                GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+                regex_exprs = {
+                    "\\s?\\p{L}+",
+                    "\\s?\\p{P}+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+            case LLAMA_VOCAB_PRE_TYPE_REFACT:
+            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+                regex_exprs = {
+                    "\\p{N}",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT2:
+            case LLAMA_VOCAB_PRE_TYPE_OLMO:
+                regex_exprs = {
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
+            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_PORO:
+                regex_exprs = {
+                    " ?[^(\\s|.,!?…。，、।۔،)]+",
+                };
                 break;
             default:
-                GGML_ASSERT(false);
+                // default regex for BPE tokenization pre-processing
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "\\p{N}+",
+                    "[0-9][0-9][0-9]",
+                };
                 break;
         }
+    }
+
+    void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
+        output.push_back(token_id);
+    }
+
+    bool append_bos(std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_bos) {
+            GGML_ASSERT(vocab.special_bos_id != -1);
+            output.push_back(vocab.special_bos_id);
+            return true;
+        }
+        return false;
+    }
+
+    bool append_eos(std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_eos) {
+            GGML_ASSERT(vocab.special_eos_id != -1);
+            output.push_back(vocab.special_eos_id);
+            return true;
+        }
+        return false;
+    }
+
+    void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+        if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
+                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+    }
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        int final_prev_index = -1;
+
+        const auto word_collection = unicode_regex_split(text, regex_exprs);

         symbols_final.clear();

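The rewrite turns pre-tokenizer selection into constructor-time state (regex_exprs), so one tokenizer instance can serve every fragment of a prompt; the llama_tokenize_internal hunks below rely on that. The intended call sequence, sketched (vocab, add_special and the input text are placeholders):

    // sketch of the new call sequence around llm_tokenizer_bpe
    std::vector<llama_vocab::id> out;
    llm_tokenizer_bpe tokenizer(vocab);      // regexes for vocab.type_pre chosen once

    if (add_special) {
        tokenizer.append_bos(out);           // honors vocab.tokenizer_add_bos
    }
    tokenizer.tokenize("Hello world", out);  // unicode_regex_split + BPE merges
    if (add_special) {
        tokenizer.append_eos(out);
        tokenizer.check_double_bos_eos(out); // warn on doubled BOS/EOS
    }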
@@ -13041,7 +13360,7 @@ struct llm_tokenizer_bpe {
             int index = 0;
             size_t offset = 0;

-            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+            if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
                 symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                 offset = word.size();
             }
@@ -13122,10 +13441,9 @@ struct llm_tokenizer_bpe {
                 for (auto j = str.begin(); j != str.end(); ++j) {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
-                    if (token_multibyte == vocab.token_to_id.end()) {
-                        throw std::runtime_error("ERROR: byte not found in vocab");
+                    if (token_multibyte != vocab.token_to_id.end()) {
+                        output.push_back(token_multibyte->second);
                     }
-                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -13164,6 +13482,8 @@ private:

     const llama_vocab & vocab;

+    std::vector<std::string> regex_exprs;
+
     std::vector<llm_symbol> symbols;
     std::vector<llm_symbol> symbols_final;

@@ -13173,7 +13493,7 @@ private:
 struct llm_tokenizer_wpm {
     llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}

-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
         const auto & token_map = vocab.token_to_id;

         // normalize and split by whitespace
@@ -13182,7 +13502,7 @@ struct llm_tokenizer_wpm {
         // bos token prepended already

         // find the longest tokens that form the words
-        for (const std::string &word : words) {
+        for (const std::string & word : words) {
             // skip empty words
             if (word.size() == 0) {
                 continue;
@@ -13199,7 +13519,7 @@ struct llm_tokenizer_wpm {
             for (int i = 0; i < n; ++i) {
                 // loop through possible match length
                 bool match = false;
-                for (int j = n; j > i; j--) {
+                for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
                     auto it = token_map.find(word1.substr(i, j - i));
                     if (it != token_map.end()) {
                         output.push_back(it->second);
@@ -13222,11 +13542,12 @@ struct llm_tokenizer_wpm {
         }
     }

-    std::vector<std::string> preprocess(const std::string & text) {
+    // TODO: reduce string copies by using cpts_offs array
+    std::vector<std::string> preprocess(const std::string & text) const {
         const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
         std::vector<std::string> words(1, "");

-        for (const char32_t cpt : cpts_nfd) {
+        for (const uint32_t cpt : cpts_nfd) {
             const auto flags = unicode_cpt_flags(cpt);

             if (flags.is_whitespace) {
@@ -13444,7 +13765,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &

                 bool is_prev_special = false;

-                if (add_special && vocab.special_add_bos != 0) {
+                if (add_special && vocab.tokenizer_add_bos) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                     is_prev_special = true;
@@ -13454,7 +13775,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

-                        if (vocab.add_space_prefix) {
+                        if (vocab.tokenizer_add_space_prefix) {
                             if (!output.size() || is_prev_special) {  // prefix with space if first token
                                 raw_text = " " + raw_text;
                             }
@@ -13472,23 +13793,24 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }

-                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
                     LLAMA_LOG_WARN(
                         "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                         "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                         "Are you sure this is what you want?\n", __FUNCTION__);
                 }

-                if (add_special && vocab.special_add_eos == 1) {
+                if (add_special && vocab.tokenizer_add_eos) {
                     GGML_ASSERT(vocab.special_eos_id != -1);
                     output.push_back(vocab.special_eos_id);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                if (add_special && vocab.special_add_bos == 1) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
-                    output.push_back(vocab.special_bos_id);
+                llm_tokenizer_bpe tokenizer(vocab);
+
+                if (add_special) {
+                    tokenizer.append_bos(output);
                 }
@@ -13498,23 +13820,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        llm_tokenizer_bpe tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        output.push_back(fragment.token);
+                        tokenizer.append(fragment.token, output);
                     }
                 }

-                if (add_special && vocab.special_add_bos == 1 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
-                    LLAMA_LOG_WARN(
-                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
-                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
-                        "Are you sure this is what you want?\n", __FUNCTION__);
-                }
-
-                if (add_special && vocab.special_add_eos == 1) {
-                    GGML_ASSERT(vocab.special_add_eos != -1);
-                    output.push_back(vocab.special_eos_id);
+                if (add_special) {
+                    tokenizer.append_eos(output);
+                    tokenizer.check_double_bos_eos(output);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
@@ -13524,6 +13838,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     output.push_back(vocab.special_cls_id);
                 }

+                llm_tokenizer_wpm tokenizer(vocab);
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -13531,7 +13847,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        llm_tokenizer_wpm tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
@@ -13631,7 +13946,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
         const uint32_t chr) {

     bool found            = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;

     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT

@@ -13640,6 +13955,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
             // inclusive range, e.g. [a-z]
             found = found || (pos->value <= chr && chr <= pos[1].value);
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            found = true;
+            pos += 1;
         } else {
             // exact char match, e.g. [a] or "a"
             found = found || pos->value == chr;
@@ -13657,7 +13976,7 @@ static bool llama_grammar_match_partial_char(
         const llama_grammar_element * pos,
         const llama_partial_utf8     partial_utf8) {

-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);

     uint32_t partial_value = partial_utf8.value;
@@ -13687,6 +14006,9 @@ static bool llama_grammar_match_partial_char(
                 return is_positive_char;
             }
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            return true;
         } else {
             // exact char match, e.g. [a] or "a"
             if (low <= pos->value && pos->value <= high) {
@@ -13747,6 +14069,7 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
+        case LLAMA_GRETYPE_CHAR_ANY:
             if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
                 new_stacks.emplace_back(stack);
@@ -15220,6 +15543,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
             qs.has_imatrix = true;
+            // check imatrix for nans or infs
+            for (const auto & kv : *imatrix_data) {
+                for (float f : kv.second) {
+                    if (!std::isfinite(f)) {
+                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+                    }
+                }
+            }
         }
     }

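Quantization now fails fast on corrupt importance matrices: one NaN entry would otherwise leak into every scale derived from it. The added loop is equivalent to this standalone sketch (container type and message are illustrative; the real code uses llama.cpp's format() helper):

    // sketch: reject non-finite importance weights up front
    #include <cmath>
    #include <map>
    #include <stdexcept>
    #include <string>
    #include <vector>

    static void validate_imatrix(const std::map<std::string, std::vector<float>> & imatrix) {
        for (const auto & kv : imatrix) {
            for (const float f : kv.second) {
                if (!std::isfinite(f)) { // catches NaN, +inf and -inf
                    throw std::runtime_error("imatrix contains non-finite value for " + kv.first);
                }
            }
        }
    }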
@@ -16024,6 +16355,11 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }

+    if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
+        LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
@@ -16195,8 +16531,7 @@ struct llama_context * llama_new_context_with_model(
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
             if (backend == nullptr) {
-                int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
@@ -16226,6 +16561,16 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #endif
+
+#ifdef GGML_USE_BLAS
+        ctx->backend_blas = ggml_backend_blas_init();
+        if (ctx->backend_blas == nullptr) {
+            LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+        } else {
+            ctx->backends.push_back(ctx->backend_blas);
+        }
+#endif
+
 #if defined(GGML_USE_RPC)
         if (model->n_gpu_layers > 0) {
             for (const auto & endpoint : model->rpc_servers) {
@@ -17814,6 +18159,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
     ctx->abort_callback_data = abort_callback_data;
 }

+void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+    ctx->cparams.embeddings = embeddings;
+}
+
 void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
     ctx->cparams.causal_attn = causal_attn;
 }
@@ -18057,11 +18406,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
 }

 int32_t llama_add_bos_token(const struct llama_model * model) {
-    return model->vocab.special_add_bos;
+    return model->vocab.tokenizer_add_bos;
 }

 int32_t llama_add_eos_token(const struct llama_model * model) {
-    return model->vocab.special_add_eos;
+    return model->vocab.tokenizer_add_eos;
 }

 llama_token llama_token_prefix(const struct llama_model * model) {