llama_cpp 0.15.3 → 0.16.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/llama_cpp.cpp +27 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +66 -36
- data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
- data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +35 -16
- data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -7
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -35
- data/vendor/tmp/llama.cpp/ggml-metal.metal +146 -80
- data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +345 -227
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +458 -329
- data/vendor/tmp/llama.cpp/ggml.c +301 -409
- data/vendor/tmp/llama.cpp/ggml.h +19 -23
- data/vendor/tmp/llama.cpp/llama.cpp +855 -651
- data/vendor/tmp/llama.cpp/llama.h +28 -48
- metadata +121 -6
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
Diff of data/vendor/tmp/llama.cpp/llama.cpp (reconstructed hunks; lines the extraction lost are shown as bare `-`):

@@ -13,8 +13,6 @@

 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
-#elif defined(GGML_USE_CLBLAST)
-#  include "ggml-opencl.h"
 #elif defined(GGML_USE_VULKAN)
 #  include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
@@ -103,14 +101,14 @@
 #endif

 #define LLAMA_MAX_NODES   8192
-#define LLAMA_MAX_EXPERTS
+#define LLAMA_MAX_EXPERTS 160

 //
 // logging
 //

 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal        (ggml_log_level level, const char* format, ...);
+static void llama_log_internal        (ggml_log_level level, const char * format, ...);
 static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);

 #define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
@@ -222,6 +220,7 @@ enum llm_arch {
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
     LLM_ARCH_ARCTIC,
+    LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_UNKNOWN,
 };

@@ -259,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DBRX,      "dbrx"      },
     { LLM_ARCH_OLMO,      "olmo"      },
     { LLM_ARCH_ARCTIC,    "arctic"    },
+    { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_UNKNOWN,   "(unknown)" },
 };

@@ -279,11 +279,15 @@ enum llm_kv {
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
     LLM_KV_BLOCK_COUNT,
+    LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
+    LLM_KV_EXPERT_SHARED_COUNT,
+    LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,

@@ -296,6 +300,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
     LLM_KV_ATTENTION_CAUSAL,
+    LLM_KV_ATTENTION_Q_LORA_RANK,
+    LLM_KV_ATTENTION_KV_LORA_RANK,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -305,6 +311,7 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ATTN_FACTOR,
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
+    LLM_KV_ROPE_SCALING_YARN_LOG_MUL,

     LLM_KV_SPLIT_NO,
     LLM_KV_SPLIT_COUNT,
@@ -353,17 +360,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL,     "general.source.url"                    },
     { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

-    { LLM_KV_VOCAB_SIZE,            "%s.vocab_size"            },
-    { LLM_KV_CONTEXT_LENGTH,        "%s.context_length"        },
-    { LLM_KV_EMBEDDING_LENGTH,      "%s.embedding_length"      },
-    { LLM_KV_BLOCK_COUNT,           "%s.block_count"           },
-    { LLM_KV_FEED_FORWARD_LENGTH,   "%s.feed_forward_length"   },
-    { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
-    { LLM_KV_TENSOR_DATA_LAYOUT,    "%s.tensor_data_layout"    },
-    { LLM_KV_EXPERT_COUNT,          "%s.expert_count"          },
-    { LLM_KV_EXPERT_USED_COUNT,     "%s.expert_used_count"     },
-    { LLM_KV_POOLING_TYPE,          "%s.pooling_type"          },
-    { LLM_KV_LOGIT_SCALE,           "%s.logit_scale"           },
+    { LLM_KV_VOCAB_SIZE,                 "%s.vocab_size"                 },
+    { LLM_KV_CONTEXT_LENGTH,             "%s.context_length"             },
+    { LLM_KV_EMBEDDING_LENGTH,           "%s.embedding_length"           },
+    { LLM_KV_BLOCK_COUNT,                "%s.block_count"                },
+    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,  "%s.leading_dense_block_count"  },
+    { LLM_KV_FEED_FORWARD_LENGTH,        "%s.feed_forward_length"        },
+    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+    { LLM_KV_USE_PARALLEL_RESIDUAL,      "%s.use_parallel_residual"      },
+    { LLM_KV_TENSOR_DATA_LAYOUT,         "%s.tensor_data_layout"         },
+    { LLM_KV_EXPERT_COUNT,               "%s.expert_count"               },
+    { LLM_KV_EXPERT_USED_COUNT,          "%s.expert_used_count"          },
+    { LLM_KV_EXPERT_SHARED_COUNT,        "%s.expert_shared_count"        },
+    { LLM_KV_EXPERT_WEIGHTS_SCALE,       "%s.expert_weights_scale"       },
+    { LLM_KV_POOLING_TYPE,               "%s.pooling_type"               },
+    { LLM_KV_LOGIT_SCALE,                "%s.logit_scale"                },

     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -374,6 +385,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_LAYERNORM_EPS,     "%s.attention.layer_norm_epsilon"     },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
     { LLM_KV_ATTENTION_CAUSAL,            "%s.attention.causal"                 },
+    { LLM_KV_ATTENTION_Q_LORA_RANK,       "%s.attention.q_lora_rank"            },
+    { LLM_KV_ATTENTION_KV_LORA_RANK,      "%s.attention.kv_lora_rank"           },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE,       "%s.rope.freq_base"       },
@@ -383,6 +396,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALING_ATTN_FACTOR,  "%s.rope.scaling.attn_factor"             },
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED,    "%s.rope.scaling.finetuned"               },
+    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier"     },

     { LLM_KV_SPLIT_NO,    "split.no"    },
     { LLM_KV_SPLIT_COUNT, "split.count" },
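The `%s.` placeholders in these key names are filled with the architecture string from LLM_ARCH_NAMES, so for the new architecture the loader reads GGUF keys such as `deepseek2.expert_shared_count` or `deepseek2.rope.scaling.yarn_log_multiplier`. A minimal sketch of that expansion (the `format_kv` helper is illustrative; llama.cpp uses its own internal `format` utility):

```cpp
#include <cstdio>
#include <string>

// Illustrative stand-in for how an LLM_KV_NAMES pattern becomes a concrete
// GGUF key: the architecture name replaces the "%s".
static std::string format_kv(const char * pattern, const char * arch) {
    char buf[256];
    snprintf(buf, sizeof(buf), pattern, arch);
    return std::string(buf);
}

int main() {
    // prints "deepseek2.leading_dense_block_count"
    printf("%s\n", format_kv("%s.leading_dense_block_count", "deepseek2").c_str());
    // prints "deepseek2.rope.scaling.yarn_log_multiplier"
    printf("%s\n", format_kv("%s.rope.scaling.yarn_log_multiplier", "deepseek2").c_str());
    return 0;
}
```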
@@ -474,6 +488,12 @@ enum llm_tensor {
     LLM_TENSOR_SSM_A,
     LLM_TENSOR_SSM_D,
     LLM_TENSOR_SSM_OUT,
+    LLM_TENSOR_ATTN_Q_A,
+    LLM_TENSOR_ATTN_Q_B,
+    LLM_TENSOR_ATTN_KV_A_MQA,
+    LLM_TENSOR_ATTN_KV_B,
+    LLM_TENSOR_ATTN_Q_A_NORM,
+    LLM_TENSOR_ATTN_KV_A_NORM,
 };

 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1057,6 +1077,35 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_DEEPSEEK2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q_A_NORM,      "blk.%d.attn_q_a_norm" },
+            { LLM_TENSOR_ATTN_KV_A_NORM,     "blk.%d.attn_kv_a_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_A,           "blk.%d.attn_q_a" },
+            { LLM_TENSOR_ATTN_Q_B,           "blk.%d.attn_q_b" },
+            { LLM_TENSOR_ATTN_KV_A_MQA,      "blk.%d.attn_kv_a_mqa" },
+            { LLM_TENSOR_ATTN_KV_B,          "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1651,12 +1700,13 @@ struct llama_mlock {
 };
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

-
+// NOTE: avoid ever using this except for building the token_to_piece caches
+static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(
+    const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(
+        int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     }
     else {
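The helper above relies on `llama_token_to_piece` returning the negative of the required buffer length when the piece does not fit, so a small first attempt can be retried at the exact size. A standalone sketch of the same pattern, assuming only the public API call shown in the hunk:

```cpp
#include <string>
#include <vector>
#include "llama.h"

// Sketch of the resize-and-retry convention used above: start with a small
// buffer; a negative return value is -(required length), so resize and retry.
static std::string token_to_piece_sketch(const llama_model * model, llama_token token, bool special) {
    std::vector<char> buf(8, 0);
    int n = llama_token_to_piece(model, token, buf.data(), buf.size(), special);
    if (n < 0) {
        buf.resize(-n);                // grow to the exact reported size
        n = llama_token_to_piece(model, token, buf.data(), buf.size(), special);
    }
    return std::string(buf.data(), n); // n is now the piece length
}
```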
@@ -1741,6 +1791,7 @@ enum e_model {
     MODEL_13B,
     MODEL_14B,
     MODEL_15B,
+    MODEL_16B,
     MODEL_20B,
     MODEL_30B,
     MODEL_34B,
@@ -1748,6 +1799,7 @@ enum e_model {
     MODEL_40B,
     MODEL_65B,
     MODEL_70B,
+    MODEL_236B,
     MODEL_314B,
     MODEL_SMALL,
     MODEL_MEDIUM,
@@ -1783,13 +1835,21 @@ struct llama_hparams {
     uint32_t n_expert_used = 0;
     uint32_t n_vocab_type  = 0; // for BERT-style token types

+    uint32_t n_layer_dense_lead   = 0;
+    uint32_t n_lora_q             = 0;
+    uint32_t n_lora_kv            = 0;
+    uint32_t n_ff_exp             = 0;
+    uint32_t n_expert_shared      = 0;
+    float    expert_weights_scale = 0.0;
+
     float f_norm_eps;
     float f_norm_rms_eps;

     float    rope_attn_factor = 1.0f;
     float    rope_freq_base_train;
     float    rope_freq_scale_train;
-    uint32_t
+    uint32_t n_ctx_orig_yarn;
+    float    rope_yarn_log_mul;

     // for State Space Models
     uint32_t ssm_d_conv = 0;
@@ -1823,8 +1883,14 @@ struct llama_hparams {
         if (this->n_expert      != other.n_expert)      return true;
         if (this->n_expert_used != other.n_expert_used) return true;

+        if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
+        if (this->n_lora_q           != other.n_lora_q)           return true;
+        if (this->n_lora_kv          != other.n_lora_kv)          return true;
+        if (this->n_ff_exp           != other.n_ff_exp)           return true;
+        if (this->n_expert_shared    != other.n_expert_shared)    return true;
+
         if (this->rope_finetuned  != other.rope_finetuned)  return true;
-        if (this->
+        if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;

         if (this->ssm_d_conv  != other.ssm_d_conv)  return true;
         if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -1838,6 +1904,8 @@ struct llama_hparams {
         if (!is_float_close(this->rope_attn_factor,      other.rope_attn_factor,      EPSILON)) return true;
         if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+        if (!is_float_close(this->expert_weights_scale,  other.expert_weights_scale,  EPSILON)) return true;
+        if (!is_float_close(this->rope_yarn_log_mul,     other.rope_yarn_log_mul,     EPSILON)) return true;

         return false;
     }
@@ -1881,7 +1949,7 @@ struct llama_cparams {
     float rope_freq_base;
     float rope_freq_scale;

-    uint32_t
+    uint32_t n_ctx_orig_yarn;
     // These hyperparameters are not exposed in GGUF, because all
     // existing YaRN models use the same values for them.
     float yarn_ext_factor;
@@ -1913,6 +1981,8 @@ struct llama_layer {
     struct ggml_tensor * attn_k_norm_b;
     struct ggml_tensor * attn_out_norm;
     struct ggml_tensor * attn_out_norm_b;
+    struct ggml_tensor * attn_q_a_norm;
+    struct ggml_tensor * attn_kv_a_norm;

     // attention
     struct ggml_tensor * wq;
@@ -1920,6 +1990,10 @@ struct llama_layer {
     struct ggml_tensor * wv;
     struct ggml_tensor * wo;
     struct ggml_tensor * wqkv;
+    struct ggml_tensor * wq_a;
+    struct ggml_tensor * wq_b;
+    struct ggml_tensor * wkv_a_mqa;
+    struct ggml_tensor * wkv_b;

     // attention bias
     struct ggml_tensor * bq;
@@ -1953,8 +2027,9 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_shexp;

     // ff bias
-    struct ggml_tensor *
-    struct ggml_tensor *
+    struct ggml_tensor * ffn_gate_b = nullptr;
+    struct ggml_tensor * ffn_down_b = nullptr; // b2
+    struct ggml_tensor * ffn_up_b   = nullptr; // b3
     struct ggml_tensor * ffn_act;

     // mamba proj
@@ -2072,12 +2147,12 @@ struct llama_control_vector {
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
-    using
+    using tattr = llama_token_attr;

     struct token_data {
         token text;
         float score;
-
+        tattr attr;
     };

     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@@ -2086,7 +2161,8 @@ struct llama_vocab {
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;

-    std::
+    std::vector<id>    cache_special_tokens;
+    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);

     std::map<std::pair<std::string, std::string>, int> bpe_ranks;

@@ -2293,13 +2369,34 @@ struct llama_context {
     struct llama_control_vector cvec;
 };

+static size_t llama_get_device_count(const llama_model & model) {
+    size_t count = 1;
+#if defined(GGML_USE_CUDA)
+    count = ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    count = ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    count = ggml_backend_vk_get_device_count();
+#endif
+#if defined(GGML_USE_RPC)
+    count += model.rpc_servers.size();
+#endif
+    return count;
+    GGML_UNUSED(model);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;

-#
-
-
-
+#if defined(GGML_USE_RPC)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (gpu >= dev_count - rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+        return ggml_backend_rpc_buffer_type(endpoint);
+    }
+#endif
+#if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
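With this change a single index space covers both local and remote devices: RPC servers are appended after the local backend GPUs, so an index past the local range selects `model.rpc_servers[gpu - dev_count + rpc_count]`. A small sketch of the convention (the helper names are illustrative, not llama.cpp API):

```cpp
#include <cstddef>

// Device index layout implied by the hunk above:
//   [0, dev_count - rpc_count)          -> local backend devices
//   [dev_count - rpc_count, dev_count)  -> entries of model.rpc_servers
static bool is_rpc_device(int gpu, int dev_count, int rpc_count) {
    return gpu >= dev_count - rpc_count;
}

static size_t rpc_server_index(int gpu, int dev_count, int rpc_count) {
    // e.g. 2 local GPUs + 1 RPC server => dev_count = 3, rpc_count = 1;
    // gpu = 2 maps to 2 - 3 + 1 = 0, the first RPC endpoint.
    return (size_t)(gpu - dev_count + rpc_count);
}
```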
@@ -2307,8 +2404,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     buft = ggml_backend_vk_buffer_type(gpu);
 #elif defined(GGML_USE_SYCL)
     buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-    buft = ggml_backend_opencl_buffer_type();
 #elif defined(GGML_USE_KOMPUTE)
     buft = ggml_backend_kompute_buffer_type(gpu);
     if (buft == nullptr) {
@@ -2347,29 +2442,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
     GGML_UNUSED(tensor_split);
 }

-static size_t llama_get_device_count(const llama_model & model) {
-#if defined(GGML_USE_RPC)
-    return model.rpc_servers.size();
-#elif defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-    GGML_UNUSED(model);
-}
-
 static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_RPC)
-
-
-
-
-
-
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (device >= dev_count - rpc_count) {
+        size_t total;
+        size_t free;
+        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
+        return free;
+    }
+#endif
+#if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &free, &total);
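`ggml_backend_rpc_get_device_memory` fills in both free and total bytes for a remote endpoint; the loader above only consumes the free figure. A hedged usage sketch (the endpoint address is made up for illustration):

```cpp
#include <cstdio>
#include "ggml-rpc.h"

int main() {
    size_t free_mem  = 0;
    size_t total_mem = 0;
    // "192.168.1.10:50052" is an illustrative endpoint, not from the diff.
    ggml_backend_rpc_get_device_memory("192.168.1.10:50052", &free_mem, &total_mem);
    printf("rpc device: %zu MiB free of %zu MiB\n",
           free_mem / (1024 * 1024), total_mem / (1024 * 1024));
    return 0;
}
```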
@@ -2441,10 +2526,6 @@ static bool llama_kv_cache_init(
         }
     }

-#ifdef GGML_USE_CLBLAST
-    offload = false;
-#endif
-
     // count used buffer types
     std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
     if (offload) {
@@ -3832,6 +3913,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_13B:    return "13B";
         case MODEL_14B:    return "14B";
         case MODEL_15B:    return "15B";
+        case MODEL_16B:    return "16B";
         case MODEL_20B:    return "20B";
         case MODEL_30B:    return "30B";
         case MODEL_34B:    return "34B";
@@ -3839,6 +3921,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_40B:    return "40B";
         case MODEL_65B:    return "65B";
         case MODEL_70B:    return "70B";
+        case MODEL_236B:   return "236B";
         case MODEL_314B:   return "314B";
         case MODEL_SMALL:  return "0.1B";
         case MODEL_MEDIUM: return "0.4B";
@@ -3922,8 +4005,8 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
     hparams.rope_finetuned = rope_finetuned;

-    hparams.
-    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.
+    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
+    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

     // rope_freq_base (optional)
     hparams.rope_freq_base_train = 10000.0f;
@@ -3981,7 +4064,9 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-
+                    // granite uses a vocab with len 49152
+                    case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
+                    case 36: model.type = e_model::MODEL_8B; break; // granite
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
@@ -4251,6 +4336,8 @@ static void llm_load_hparams(
                     case 30: model.type = e_model::MODEL_3B; break;
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_15B; break;
+                    case 52: model.type = e_model::MODEL_20B; break; // granite
+                    case 88: model.type = e_model::MODEL_34B; break; // granite
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -4384,6 +4471,26 @@ static void llm_load_hparams(
                     model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_DEEPSEEK2:
+            {
+                bool is_lite = (hparams.n_layer == 27);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                if (!is_lite) {
+                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+                }
+                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+                switch (hparams.n_layer) {
+                    case 27: model.type = e_model::MODEL_16B; break;
+                    case 60: model.type = e_model::MODEL_236B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -4490,15 +4597,14 @@ static void llm_load_vocab(
         vocab.special_cls_id  = 101;
         vocab.special_mask_id = 103;
         vocab.add_space_prefix = false;
-    } else {
-
-
-
-
-
-        vocab.type = LLAMA_VOCAB_TYPE_SPM;
-        return;
+    } else if (tokenizer_model == "gpt2") {
+        vocab.type = LLAMA_VOCAB_TYPE_BPE;
+
+        const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+        if (add_space_prefix_keyidx != -1) {
+            vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
         }
+
         // read bpe merges and populate bpe ranks
         const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
         if (merges_keyidx == -1) {
@@ -4532,6 +4638,8 @@ static void llm_load_vocab(
         vocab.special_pad_id  = -1;
         vocab.special_cls_id  = -1;
         vocab.special_mask_id = -1;
+    } else {
+        throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
     }

     // for now, only BPE models have pre-tokenizers
@@ -4593,6 +4701,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "dbrx") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+            } else if (
+                tokenizer_pre == "smaug-bpe") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -4631,7 +4742,20 @@ static void llm_load_vocab(
         auto & token_data = vocab.id_to_token[i];
         token_data.text  = std::move(word);
         token_data.score = scores ? scores[i] : 0.0f;
-        token_data.
+        token_data.attr  = LLAMA_TOKEN_ATTR_NORMAL;
+
+        if (toktypes) {  //TODO: remove, required until per token attributes are available from GGUF file
+            switch(toktypes[i]) {
+                case LLAMA_TOKEN_TYPE_UNKNOWN:      token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN;      break;
+                case LLAMA_TOKEN_TYPE_UNUSED:       token_data.attr = LLAMA_TOKEN_ATTR_UNUSED;       break;
+                case LLAMA_TOKEN_TYPE_NORMAL:       token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;       break;
+                case LLAMA_TOKEN_TYPE_CONTROL:      token_data.attr = LLAMA_TOKEN_ATTR_CONTROL;      break;
+                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+                case LLAMA_TOKEN_TYPE_BYTE:         token_data.attr = LLAMA_TOKEN_ATTR_BYTE;         break;
+                case LLAMA_TOKEN_TYPE_UNDEFINED:    token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
+                default:                            token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
+            }
+        }
     }
     GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());

@@ -4721,96 +4845,88 @@ static void llm_load_vocab(

     // build special tokens cache
     {
-
-
-
-
-
-        // From testing, this appears to correlate 1:1 with special tokens.
-        //
+        for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
+            if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
+                vocab.cache_special_tokens.push_back(id);
+            }
+        }

-
-
-
-
-
+        std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
+            [&] (const llama_vocab::id a, const llama_vocab::id b) {
+                return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
+            }
+        );

-
+        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
+    }

-
-
+    // build token to piece cache
+    {
+        size_t size_cache = 0;

-
-        if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
-            special_tokens_count_by_type++;
-        }
+        std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);

-
-
-        bool is_tokenizable = false;
+        for (uint32_t id = 0; id < n_vocab; ++id) {
+            cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);

-
-
-        for (unsigned i = 1; i < token.length();) {
-            const auto left  = token.substr(0, i);
-            const auto right = token.substr(i);
+            size_cache += cache_token_to_piece[id].size();
+        }

-
-        auto utf = utf8_len(left.at(left.length() - 1));
+        std::swap(vocab.cache_token_to_piece, cache_token_to_piece);

-
-
-
-
-
-
-
-
-
-
-
+        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
+    }
+
+    // Handle per token attributes
+    //NOTE: Each model customizes per token attributes.
+    //NOTE: Per token attributes are missing from the GGUF file.
+    //TODO: Extract attributes from GGUF file.
+    {
+        auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
+            for (auto substr : substrs) {
+                if (str.find(substr) < std::string::npos) {
+                    return true;
                 }
+            }
+            return false;
+        };

-
-
+        auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
+            uint32_t current = vocab.id_to_token.at(id).attr;
+            current = value ? (current | attr) : (current & ~attr);
+            vocab.id_to_token[id].attr = (llama_token_attr) current;
+        };

-
-
-            utf8_str_len++;
-            i += utf8_len(token.at(i));
-        }
+        auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+            _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
+        };

-
-
-        // At this point what we have left are special tokens only
-        vocab.special_tokens_cache[token] = id;
+        std::string model_name;
+        std::string tokenizer_pre;

-
-
+        ml.get_key(LLM_KV_GENERAL_NAME,  model_name,    false);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);

-
-
-
-
-        }
-    }
+        // model name to lowercase
+        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+            [] (const std::string::value_type x) {
+                return std::tolower(x);
             }
-
+        );

-
-
-
-
-
-
-
-
-
-        )
+        // set attributes by model/tokenizer name
+        if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
+            _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+        } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+            for (auto id : vocab.cache_special_tokens) {
+                _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
+            }
+            for (auto token : {"</s>"}) {
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
+            }
+            for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
+            }
         }
     }
 }
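The attributes introduced above are bit flags, so one token can carry several at once, and `_set_tokenid_attr` flips a single flag without touching the rest. A standalone sketch of the same bit arithmetic (the flag values here are illustrative; llama.h defines the real `llama_token_attr` constants):

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative flag values, not the actual llama_token_attr enum.
enum : uint32_t {
    ATTR_NORMAL = 1u << 0,
    ATTR_LSTRIP = 1u << 5,
    ATTR_RSTRIP = 1u << 6,
};

// Same pattern as _set_tokenid_attr: set or clear one flag, keep the others.
static uint32_t set_attr(uint32_t current, uint32_t attr, bool value) {
    return value ? (current | attr) : (current & ~attr);
}

int main() {
    uint32_t attrs = ATTR_NORMAL;
    attrs = set_attr(attrs, ATTR_RSTRIP, true);   // e.g. phi-3 marks special tokens RSTRIP
    attrs = set_attr(attrs, ATTR_RSTRIP, false);  // ...and clears it again for "<unk>", "<s>"
    printf("attrs = 0x%x\n", attrs);              // back to ATTR_NORMAL only
    return 0;
}
```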
@@ -4852,7 +4968,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: rope scaling     = %s\n",   __func__, rope_scaling_type);
     LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",   __func__, hparams.rope_freq_scale_train);
-    LLAMA_LOG_INFO("%s:
+    LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",   __func__, hparams.n_ctx_orig_yarn);
     LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",   __func__, hparams.rope_finetuned ? "yes" : "unknown");
     LLAMA_LOG_INFO("%s: ssm_d_conv       = %u\n",   __func__, hparams.ssm_d_conv);
     LLAMA_LOG_INFO("%s: ssm_d_inner      = %u\n",   __func__, hparams.ssm_d_inner);
@@ -4892,6 +5008,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token        = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
     if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token        = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
     if (vocab.special_eot_id    != -1) { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,    vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+
+    if (model.arch == LLM_ARCH_DEEPSEEK2) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",   __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_lora_q             = %d\n",   __func__, hparams.n_lora_q);
+        LLAMA_LOG_INFO("%s: n_lora_kv            = %d\n",   __func__, hparams.n_lora_kv);
+        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",   __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",   __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n", __func__, hparams.rope_yarn_log_mul);
+    }
 }

 // Returns false if cancelled by progress_callback
@@ -5048,8 +5174,6 @@ static bool llm_load_tensors(
         throw std::runtime_error("model has expert layers but no expert layers are used");
     }

-    GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
     ggml_context * ctx_input        = ctx_map.at(model.buft_input.buft);
     ggml_context * ctx_output       = ctx_map.at(model.buft_output.buft);
     ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
@@ -5069,12 +5193,10 @@ static bool llm_load_tensors(
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-
-
-
-
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                }
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
                 }

@@ -5103,6 +5225,11 @@ static bool llm_load_tensors(
                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
                     layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+
+                    // optional MLP bias
+                    layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff},   llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.ffn_up_b   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff},   llama_model_loader::TENSOR_NOT_REQUIRED);
                 } else {
                     layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

@@ -6210,6 +6337,70 @@ static bool llm_load_tensors(
                         layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert});
                     }
                 } break;
+            case LLM_ARCH_DEEPSEEK2:
+                {
+                    bool is_lite = (hparams.n_layer == 27);
+
+                    const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+                    const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+                    const uint32_t q_lora_rank  = hparams.n_lora_q;
+                    const uint32_t kv_lora_rank = hparams.n_lora_kv;
+                    const uint32_t n_ff_exp     = hparams.n_ff_exp;
+
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        if (!is_lite) {
+                            layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
+                        }
+                        layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
+
+                        if (!is_lite) {
+                            layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+                            layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
+                        } else {
+                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+                        }
+                        layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
+                        layer.wkv_b     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
+                        layer.wo        = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        if ((uint32_t) i < hparams.n_layer_dense_lead) {
+                            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                            layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                            layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        } else {
+                            layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+                            GGML_ASSERT(hparams.n_expert      > 0);
+                            GGML_ASSERT(hparams.n_expert_used > 0);
+
+                            // MoE branch
+                            layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
+                            layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
+                            layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});
+
+                            // Shared expert branch
+                            layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
+                            layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * hparams.n_expert_shared, n_embd});
+                            layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
+                        }
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
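For orientation, each attention head in this architecture splits into a RoPE-rotated part of width `n_rot` and an unrotated remainder, which is what the `n_embd_head_qk_rope`/`n_embd_head_qk_nope` arithmetic above encodes. A worked example with DeepSeek-V2's published configuration (these concrete values come from the model card, not from this diff):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Assumed DeepSeek-V2 values (from the model card, not this diff).
    const uint32_t n_rot         = 64;
    const uint32_t n_embd_head_k = 192;
    const uint32_t n_embd_head_v = 128;
    const uint32_t kv_lora_rank  = 512;

    const uint32_t qk_rope = n_rot;                 // 64: RoPE-rotated slice of each head
    const uint32_t qk_nope = n_embd_head_k - n_rot; // 128: unrotated slice

    // wkv_a_mqa output width: compressed KV plus one shared RoPE key
    printf("wkv_a_mqa cols: %u\n", kv_lora_rank + qk_rope);       // 576
    // wkv_b expands each head to K(nope) + V
    printf("wkv_b cols per head: %u\n", qk_nope + n_embd_head_v); // 256
    return 0;
}
```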
@@ -6664,6 +6855,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
         int64_t n_expert_used,
         llm_ffn_op_type type_op,
         bool norm_w,
+        bool scale_w,
+        float w_scale,
         const llm_build_cb & cb,
         int il) {
     int64_t n_embd = cur->ne[0];
@@ -6695,6 +6888,10 @@ static struct ggml_tensor * llm_build_moe_ffn(

         weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
     }
+    if (scale_w) {
+        weights = ggml_scale(ctx, weights, w_scale);
+        cb(weights, "ffn_moe_weights_scaled", il);
+    }

     cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
     ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
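The new `scale_w`/`w_scale` pair lets an architecture rescale the routing weights after top-k selection and optional normalization; the DeepSeek-V2 path feeds `hparams.expert_weights_scale` through it, while the pre-existing call sites pass `false, 0.0` (see the later hunks). A toy numeric sketch of the step (values are made up):

```cpp
#include <cstdio>

int main() {
    // toy top-2 routing weights for one token, already softmax-normalized
    float w[2] = {0.6f, 0.4f};

    const bool  scale_w = true;   // DeepSeek-V2-style caller
    const float w_scale = 16.0f;  // stands in for hparams.expert_weights_scale

    if (scale_w) {
        for (float & x : w) {
            x *= w_scale;         // mirrors ggml_scale(ctx, weights, w_scale)
        }
    }
    printf("scaled weights: %.1f %.1f\n", w[0], w[1]); // 9.6 6.4
    return 0;
}
```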
@@ -6937,7 +7134,7 @@ struct llm_build_context {
     const int32_t n_kv;      // size of KV cache to consider (n_kv <= kv_self.size)
     const int32_t n_outputs;
     const int32_t kv_head;   // index of where we store new KV data in the cache
-    const int32_t
+    const int32_t n_ctx_orig;

     const bool flash_attn;

@@ -6986,7 +7183,7 @@ struct llm_build_context {
         n_kv             (worst_case ? kv_self.size : kv_self.n),
         n_outputs        (worst_case ? n_tokens : lctx.n_outputs),
         kv_head          (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
-
+        n_ctx_orig       (cparams.n_ctx_orig_yarn),
         flash_attn       (cparams.flash_attn),
         pooling_type     (cparams.pooling_type),
         rope_type        (hparams.rope_type),
@@ -7044,7 +7241,7 @@ struct llm_build_context {
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
                         0),
-                lctx.inp_K_shift, rope_factors, n_rot, rope_type,
+                lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);

             cb(tmp, "K_shifted", il);
@@ -7153,7 +7350,7 @@ struct llm_build_context {
         // choose long/short freq factors based on the context size
        const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;

-        if (n_ctx_pre_seq > hparams.
+        if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
             return model.layers[il].rope_long;
         }

@@ -7269,14 +7466,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7305,9 +7502,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);

                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,
-                        model.layers[il].ffn_gate,
-                        model.layers[il].ffn_down,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
@@ -7325,6 +7522,7 @@ struct llm_build_context {
                         model.layers[il].ffn_down_exps,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
+                        false, 0.0,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
             }
@@ -7399,12 +7597,12 @@ struct llm_build_context {
                     case MODEL_7B:
                         Qcur = ggml_rope_ext(
                             ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                            n_rot, rope_type,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow
                         );
                         Kcur = ggml_rope_ext(
                             ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                            n_rot, rope_type,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow
                         );
                         break;
@@ -7511,14 +7709,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7631,13 +7829,13 @@ struct llm_build_context {

                 // using mode = 2 for neox mode
                 Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7755,14 +7953,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7806,6 +8004,7 @@ struct llm_build_context {
                         model.layers[il].ffn_down_exps,
                         n_expert, n_expert_used,
                         LLM_FFN_GELU, true,
+                        false, 0.0,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);

@@ -7907,14 +8106,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7949,6 +8148,7 @@ struct llm_build_context {
|
|
7949
8148
|
model.layers[il].ffn_down_exps,
|
7950
8149
|
n_expert, n_expert_used,
|
7951
8150
|
LLM_FFN_SILU, true,
|
8151
|
+
false, 0.0,
|
7952
8152
|
cb, il);
|
7953
8153
|
cb(cur, "ffn_moe_out", il);
|
7954
8154
|
|
@@ -8260,14 +8460,14 @@ struct llm_build_context {
|
|
8260
8460
|
|
8261
8461
|
Qcur = ggml_rope_ext(
|
8262
8462
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
8263
|
-
n_rot, rope_type,
|
8463
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
8264
8464
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8265
8465
|
);
|
8266
8466
|
cb(Qcur, "Qcur", il);
|
8267
8467
|
|
8268
8468
|
Kcur = ggml_rope_ext(
|
8269
8469
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
8270
|
-
n_rot, rope_type,
|
8470
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
8271
8471
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8272
8472
|
);
|
8273
8473
|
cb(Kcur, "Kcur", il);
|
@@ -8700,14 +8900,14 @@ struct llm_build_context {
|
|
8700
8900
|
|
8701
8901
|
Qcur = ggml_rope_ext(
|
8702
8902
|
ctx0, Qcur, inp_pos, nullptr,
|
8703
|
-
n_rot, rope_type,
|
8903
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
8704
8904
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8705
8905
|
);
|
8706
8906
|
cb(Qcur, "Qcur", il);
|
8707
8907
|
|
8708
8908
|
Kcur = ggml_rope_ext(
|
8709
8909
|
ctx0, Kcur, inp_pos, nullptr,
|
8710
|
-
n_rot, rope_type,
|
8910
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
8711
8911
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8712
8912
|
);
|
8713
8913
|
cb(Kcur, "Kcur", il);
|
@@ -8819,13 +9019,13 @@ struct llm_build_context {
|
|
8819
9019
|
|
8820
9020
|
// using mode = 2 for neox mode
|
8821
9021
|
Qcur = ggml_rope_ext(
|
8822
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
9022
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
8823
9023
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8824
9024
|
);
|
8825
9025
|
cb(Qcur, "Qcur", il);
|
8826
9026
|
|
8827
9027
|
Kcur = ggml_rope_ext(
|
8828
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
9028
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
8829
9029
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
8830
9030
|
);
|
8831
9031
|
cb(Kcur, "Kcur", il);
|
@@ -8931,14 +9131,14 @@ struct llm_build_context {
|
|
8931
9131
|
|
8932
9132
|
Qcur = ggml_rope_ext(
|
8933
9133
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
8934
|
-
n_rot, rope_type,
|
9134
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
8935
9135
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8936
9136
|
);
|
8937
9137
|
cb(Qcur, "Qcur", il);
|
8938
9138
|
|
8939
9139
|
Kcur = ggml_rope_ext(
|
8940
9140
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
8941
|
-
n_rot, rope_type,
|
9141
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
8942
9142
|
ext_factor, attn_factor, beta_fast, beta_slow
|
8943
9143
|
);
|
8944
9144
|
cb(Kcur, "Kcur", il);
|
@@ -9045,14 +9245,14 @@ struct llm_build_context {
|
|
9045
9245
|
|
9046
9246
|
Qcur = ggml_rope_ext(
|
9047
9247
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
9048
|
-
n_rot, rope_type,
|
9248
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
9049
9249
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9050
9250
|
);
|
9051
9251
|
cb(Qcur, "Qcur", il);
|
9052
9252
|
|
9053
9253
|
Kcur = ggml_rope_ext(
|
9054
9254
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
9055
|
-
n_rot, rope_type,
|
9255
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
9056
9256
|
ext_factor, attn_factor, beta_fast, beta_slow
|
9057
9257
|
);
|
9058
9258
|
cb(Kcur, "Kcur", il);
|
@@ -9087,6 +9287,7 @@ struct llm_build_context {
|
|
9087
9287
|
model.layers[il].ffn_down_exps,
|
9088
9288
|
n_expert, n_expert_used,
|
9089
9289
|
LLM_FFN_SILU, false,
|
9290
|
+
false, 0.0,
|
9090
9291
|
cb, il);
|
9091
9292
|
cb(cur, "ffn_moe_out", il);
|
9092
9293
|
|
@@ -9196,7 +9397,7 @@ struct llm_build_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

                 Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
@@ -9207,7 +9408,7 @@ struct llm_build_context {
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9318,7 +9519,7 @@ struct llm_build_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

                 Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
+                    ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
@@ -9327,7 +9528,7 @@ struct llm_build_context {
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
+                    ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9435,13 +9636,13 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
-                    n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);

@@ -9643,14 +9844,14 @@ struct llm_build_context {

                 struct ggml_tensor * Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 struct ggml_tensor * Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9759,14 +9960,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9876,14 +10077,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10006,14 +10207,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10078,7 +10279,7 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);

         // lm_head
-        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);
@@ -10126,7 +10327,7 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
-                    n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);

@@ -10135,7 +10336,7 @@ struct llm_build_context {

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);

@@ -10246,14 +10447,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10536,14 +10737,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10667,14 +10868,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10781,14 +10982,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10916,14 +11117,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -10974,6 +11175,7 @@ struct llm_build_context {
                         model.layers[il].ffn_down_exps,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
+                        false, 0.0,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);

@@ -11005,6 +11207,239 @@ struct llm_build_context {

         return gf;
     }
+
+    struct ggml_cgraph * build_deepseek2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        bool is_lite = (hparams.n_layer == 27);
+
+        // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
+        // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+        const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+        const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
+        const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+        const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        // {n_embd, n_tokens}
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                struct ggml_tensor * q = NULL;
+                if (!is_lite) {
+                    // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+                    cb(q, "q", il);
+
+                    q = llm_build_norm(ctx0, q, hparams,
+                            model.layers[il].attn_q_a_norm, NULL,
+                            LLM_NORM_RMS, cb, il);
+                    cb(q, "q", il);
+
+                    // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+                    cb(q, "q", il);
+                } else {
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                    cb(q, "q", il);
+                }
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        0);
+                cb(q_nope, "q_nope", il);
+
+                // and {n_head * n_embd_head_qk_rope, n_tokens}
+                struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        ggml_row_size(q->type, n_embd_head_qk_nope));
+                cb(q_pe, "q_pe", il);
+
+                // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+                struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+
+                // split into {kv_lora_rank, n_tokens}
+                struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        0);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // and {n_embd_head_qk_rope, n_tokens}
+                struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        kv_pe_compresseed->nb[1],
+                        ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+                cb(k_pe, "k_pe", il);
+
+                kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+                kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
+                        model.layers[il].attn_kv_a_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+                struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+                cb(kv, "kv", il);
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+                        ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        0);
+                cb(k_nope, "k_nope", il);
+
+                // and {n_head * n_embd_head_v, n_tokens}
+                struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+                cb(v_states, "v_states", il);
+
+                v_states = ggml_cont(ctx0, v_states);
+                cb(v_states, "v_states", il);
+
+                v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+                        ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+                        0);
+                cb(v_states, "v_states", il);
+
+                q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+                q_pe = ggml_rope_ext(
+                    ctx0, q_pe, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor_scaled, beta_fast, beta_slow
+                );
+                cb(q_pe, "q_pe", il);
+
+                // shared RoPE key
+                k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+                k_pe = ggml_rope_ext(
+                    ctx0, k_pe, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor_scaled, beta_fast, beta_slow
+                );
+                cb(k_pe, "k_pe", il);
+
+                struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
+                cb(q_states, "q_states", il);
+
+                struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+                cb(k_states, "k_states", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            if ((uint32_t) il < hparams.n_layer_dense_lead) {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                ggml_tensor * moe_out =
+                        llm_build_moe_ffn(ctx0, cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, false,
+                            true, hparams.expert_weights_scale,
+                            cb, il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // FFN shared expert
+                {
+                    ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
+                            model.layers[il].ffn_up_shexp,   NULL,
+                            model.layers[il].ffn_gate_shexp, NULL,
+                            model.layers[il].ffn_down_shexp, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
 };

 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
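The comment at the top of `build_deepseek2()` defers to ggerganov/llama.cpp discussion #7416 for why `kq_scale` and `attn_factor` are pre-scaled. In short: for YaRN, `ggml_rope_ext` internally scales the attention magnitude by `1.0f + 0.1f * logf(1.0f / freq_scale)`, while DeepSeek-2 needs its own model-specific multiplier `rope_yarn_log_mul` applied to the KQ product, so the builder folds the desired `mscale` into `kq_scale` and divides the generic factor back out of `attn_factor`. A standalone arithmetic check of the three expressions, with made-up values for the hyperparameters:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float attn_factor       = 1.0f;
        const float freq_scale        = 0.025f; // illustrative: trained ctx 4k, target ctx 160k
        const float rope_yarn_log_mul = 0.1f;   // stands in for hparams.rope_yarn_log_mul
        const int   n_embd_head_k     = 192;    // stands in for hparams.n_embd_head_k

        const float mscale   = attn_factor * (1.0f + rope_yarn_log_mul * logf(1.0f / freq_scale));
        const float kq_scale = 1.0f * mscale * mscale / sqrtf((float) n_embd_head_k);
        // cancels the 1 + 0.1*log(1/freq_scale) factor applied inside ggml_rope_ext
        const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

        printf("mscale = %f, kq_scale = %f, attn_factor_scaled = %f\n",
               mscale, kq_scale, attn_factor_scaled);
        return 0;
    }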
@@ -11223,6 +11658,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_arctic();
             } break;
+        case LLM_ARCH_DEEPSEEK2:
+            {
+                result = llm.build_deepseek2();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -12239,27 +12678,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {

 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
 }

 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
 }

 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
 }

 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
 }

 static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
 }

 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
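The vocab predicates above switch from comparing a single `type` enum to testing an `attr` bit field, so one token can now carry several attributes at once (the `LSTRIP`/`RSTRIP` handling later in this diff relies on exactly that). A small self-contained sketch of combined mask tests; the bit values here are illustrative, the real constants live in llama.h:

    #include <cstdint>

    // illustrative bit assignments, not the actual llama.h values
    enum : uint32_t {
        ATTR_CONTROL = 1u << 3,
        ATTR_RSTRIP  = 1u << 8,
    };

    // a control token that also strips trailing whitespace satisfies both masks
    static bool is_control_rstrip(uint32_t attr) {
        return (attr & ATTR_CONTROL) && (attr & ATTR_RSTRIP);
    }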
@@ -12512,6 +12951,7 @@ struct llm_tokenizer_bpe {
                 });
                 break;
             case LLAMA_VOCAB_PRE_TYPE_DBRX:
+            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
                 word_collection = unicode_regex_split(text, {
                     // same as llama3
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12734,7 +13174,7 @@ struct llm_tokenizer_wpm {
     llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}

     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        auto * token_map = &vocab.token_to_id;
+        const auto & token_map = vocab.token_to_id;

         // normalize and split by whitespace
         std::vector<std::string> words = preprocess(text);
@@ -12749,108 +13189,89 @@ struct llm_tokenizer_wpm {
             }

             // prepend phantom space
-            std::string word1 = "\xe2\x96\x81" + word;
-            int n = word1.size();
+            const std::string word1 = "\xe2\x96\x81" + word;
+            const int n = word1.size();

-            // we're at the start of a new word
-            int i = 0;
-            bool match_any = false;
+            const size_t current_tokens = output.size();

+            // we're at the start of a new word
             // move through character position in word
-            while (i < n) {
+            for (int i = 0; i < n; ++i) {
                 // loop through possible match length
                 bool match = false;
                 for (int j = n; j > i; j--) {
-                    auto it = token_map->find(word1.substr(i, j - i));
-                    if (it != token_map->end()) {
+                    auto it = token_map.find(word1.substr(i, j - i));
+                    if (it != token_map.end()) {
                         output.push_back(it->second);
                         match = true;
-                        match_any = true;
-                        i = j;
+                        i = j - 1;
                         break;
                     }
                 }

-                // must be an unknown character
-                if (!match) {
-                    i++;
+                if (!match) { // discard all
+                    output.resize(current_tokens);
+                    break;  // and discard next tokens
                 }
             }

             // we didn't find any matches for this word
-            if (!match_any) {
+            if (current_tokens == output.size()) {
                 output.push_back(vocab.special_unk_id);
             }
         }
     }

     std::vector<std::string> preprocess(const std::string & text) {
-        std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
-
-        // strip accents, strip control, uniformize whitespace,
-        // to lowercase, pad chinese characters, pad punctuation
-        std::string new_str = "";
-        for (uint32_t code : cpts_nfd) {
-            const codepoint_flags flags = unicode_cpt_flags(code);
-            if (flags.is_accent_mark || flags.is_control) {
+        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
+        std::vector<std::string> words(1, "");
+
+        for (const char32_t cpt : cpts_nfd) {
+            const auto flags = unicode_cpt_flags(cpt);
+
+            if (flags.is_whitespace) {
+                if (words.back().size()) {  // finish previous word if any
+                    words.emplace_back();
+                }
                 continue;
             }
-            code = unicode_tolower(code);
-            if (flags.is_separator || flags.is_whitespace) {
-                code = ' ';
-            }
-            std::string s = unicode_cpt_to_utf8(code);
-            if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
-                new_str += " ";
-                new_str += s;
-                new_str += " ";
-            } else {
-                new_str += s;
+
+            assert (!flags.is_separator);
+            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
+                continue;
             }
-        }

-        // split by whitespace into words
-        std::vector<std::string> words;
-        uint64_t l = 0;
-        uint64_t r = 0;
-        while (r < new_str.size()) {
-            // if is whitespace
-            if (isspace(new_str[r], std::locale::classic())) {
-                if (r > l) words.push_back(new_str.substr(l, (r - l)));
-                l = r + 1;
-                r = l;
+            const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
+            if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
+                if (words.back().size()) {  // finish previous word if any
+                    words.emplace_back();
+                }
+                words.back() = s;       // single char word
+                words.emplace_back();   // start a new word
             } else {
-                r += 1;
+                words.back() += s;  // append char to word
             }
         }
-        if (r > l) {
-            words.push_back(new_str.substr(l, (r - l)));
-        }
-        return words;
-    }

-    bool is_ascii_punct(uint32_t code) {
-        return code < 256 && ispunct(code);
+        if (!words.back().size()) {
+            words.pop_back();
+        }
+
+        return words;
     }

-    bool is_chinese_char(uint32_t cpt) {
-        if (
-            (cpt >= 0x4E00  && cpt <= 0x9FFF)  ||
-            (cpt >= 0x3400  && cpt <= 0x4DBF)  ||
+    static bool is_chinese_char(uint32_t cpt) {
+        return
+            (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
+            (cpt >= 0x03400 && cpt <= 0x04DBF) ||
             (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
             (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
             (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
             (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
-            (cpt >= 0xF900  && cpt <= 0xFAFF)  ||
-            (cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
-            (cpt >= 0x3000  && cpt <= 0x303F)  ||
-            (cpt >= 0xFF00  && cpt <= 0xFFEF)) {
-            return true; // NOLINT
-        }
-        return false;
+            (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
+            (cpt >= 0x2F800 && cpt <= 0x2FA1F);
+            //(cpt >= 0x3000  && cpt <= 0x303F) ||
+            //(cpt >= 0xFF00  && cpt <= 0xFFEF);
     }

     const llama_vocab & vocab;
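The reworked `llm_tokenizer_wpm::tokenize()` above keeps the greedy longest-match-first scan within each word, but now discards all partial matches and falls back to the unknown token as soon as one position cannot be matched (the `output.resize(current_tokens)` path), instead of silently skipping unknown characters. A toy, self-contained re-implementation of that per-word loop over a hypothetical vocabulary map:

    #include <map>
    #include <string>
    #include <vector>

    // "\xe2\x96\x81" is the phantom-space prefix U+2581 used above; unk_id is
    // returned whenever any position in the word fails to match (toy example)
    std::vector<int> wpm_word(const std::string & word,
                              const std::map<std::string, int> & token_map, int unk_id) {
        const std::string word1 = "\xe2\x96\x81" + word;
        const int n = (int) word1.size();
        std::vector<int> out;
        for (int i = 0; i < n; ++i) {
            bool match = false;
            for (int j = n; j > i; j--) {                 // longest candidate first
                auto it = token_map.find(word1.substr(i, j - i));
                if (it != token_map.end()) {
                    out.push_back(it->second);
                    match = true;
                    i = j - 1;                            // resume right after the match
                    break;
                }
            }
            if (!match) {
                return {unk_id};                          // discard partial matches, as above
            }
        }
        return out;
    }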
@@ -12894,9 +13315,9 @@ struct fragment_buffer_variant {

 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
-    for (const auto & st : vocab.cache_special_tokens) {
-        const auto & special_token = st.first;
-        const auto & special_id    = st.second;
+    for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
+        const auto & data = vocab.id_to_token[special_id];
+        const auto & special_token = data.text;

         // for each text fragment
         std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -12905,7 +13326,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<

             // if a fragment is text ( not yet processed )
             if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                auto * raw_text = &(fragment.raw_text);
+                auto & raw_text = fragment.raw_text;

                 auto raw_text_base_offset = fragment.offset;
                 auto raw_text_base_length = fragment.length;
@@ -12915,7 +13336,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                     // find the first occurrence of a given special token in this fragment
                     //  passing offset argument only limit the "search area" but match coordinates
                     //  are still relative to the source full raw_text
-                    auto match = raw_text->find(special_token, raw_text_base_offset);
+                    auto match = raw_text.find(special_token, raw_text_base_offset);

                     // no occurrences found, stop processing this fragment for a given special token
                     if (match == std::string::npos) break;
@@ -12933,13 +13354,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                     if (match > raw_text_base_offset) {
                         // left
                         const int64_t left_reminder_offset = raw_text_base_offset + 0;
-                        const int64_t left_reminder_length = match - raw_text_base_offset;
-                        buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
+                        int64_t left_reminder_length = match - raw_text_base_offset;
+
+                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
+                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+                                left_reminder_length--;
+                            }
+                        }
+
+                        if (left_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                            it++;
+                        }

 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
-                        it++;
                     }

                     // special token
@@ -12948,16 +13378,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<

                     // right
                     if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
-                        const int64_t right_reminder_offset = match + special_token.length();
-                        const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
-                        buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
+                        int64_t right_reminder_offset = match + special_token.length();
+                        int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+
+                        if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
+                            while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
+                                right_reminder_offset++;
+                                right_reminder_length--;
+                            }
+                        }
+
+                        if (right_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+                            it++;
+                        }

 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
 #endif

-                        it++;
-
                         if (source == 0) {
                             buffer.erase_after(buffer.before_begin());
                         } else {
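The two hunks above teach the special-token partitioner about the `LLAMA_TOKEN_ATTR_LSTRIP`/`LLAMA_TOKEN_ATTR_RSTRIP` attributes: whitespace adjacent to a matched special token is consumed instead of being left in the neighbouring text fragments, and zero-length fragments are no longer queued. A self-contained sketch of the right-side trim on a plain string, using a hypothetical `<|user|>` token with RSTRIP set:

    #include <cctype>
    #include <string>

    // "<|user|>   hello" -> special token + "hello"; the spaces are swallowed
    void rstrip_demo() {
        const std::string raw_text = "<|user|>   hello";
        size_t off = std::string("<|user|>").size(); // right reminder starts after the match
        size_t len = raw_text.size() - off;
        while (len > 0 && isspace((unsigned char) raw_text[off])) {
            off++;
            len--;
        }
        // raw_text.substr(off, len) == "hello"
    }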
@@ -13003,9 +13442,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     //  tokenizer.encode('', add_special_tokens=True)  returns [1]
     //  tokenizer.encode('', add_special_tokens=False) returns []

-    static const bool rtrim = true;  //TODO: as param
     bool is_prev_special = false;
-    bool special_token_rtrim = false;

     if (add_special && vocab.special_add_bos != 0) {
         GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13015,25 +13452,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &

     for (const auto & fragment : fragment_buffer) {
         if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-            // without adding this leading whitespace, we do not get the same results as the original tokenizer
-
-            // TODO: It's likely possible to get rid of this string copy entirely
-            //  by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
-            //  and passing 'add space prefix' as bool argument
-            //
             auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

-            if (special_token_rtrim) {
-                size_t num_whitespaces = 0;
-                while (isspace(raw_text[num_whitespaces])) {
-                    num_whitespaces++;
-                }
-                if (num_whitespaces == raw_text.size()) {
-                    continue; // skip if all whitespaces
-                }
-                raw_text = raw_text.substr(num_whitespaces);
-            }
-
             if (vocab.add_space_prefix) {
                 if (!output.size() || is_prev_special) {  // prefix with space if first token
                     raw_text = " " + raw_text;
@@ -13049,11 +13469,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
         } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
             output.push_back(fragment.token);
             is_prev_special = true;
-            // phi-3 special tokens without rtrim, works fine for llama-spm too
-            special_token_rtrim = rtrim
-                && fragment.token != vocab.special_bos_id
-                && fragment.token != vocab.special_unk_id
-                && fragment.token != vocab.special_eos_id;
         }
     }

@@ -14054,7 +14469,7 @@ void llama_sample_repetition_penalties(

 void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
     GGML_ASSERT(ctx);
-    const int64_t t_start_sample_us = ggml_time_us();
+    int64_t t_start_sample_us = ggml_time_us();

     bool allow_eog = false;
     for (const auto & stack : grammar->stacks) {
@@ -14066,12 +14481,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c

     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     candidates_decoded.reserve(candidates->size);
-    std::vector<llama_grammar_candidate>                              candidates_grammar;
+
+    std::vector<llama_grammar_candidate> candidates_grammar;
     candidates_grammar.reserve(candidates->size);

     for (size_t i = 0; i < candidates->size; ++i) {
-        const llama_token id    = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id, false);
+        const llama_token id = candidates->data[i].id;
+        const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);

         if (llama_token_is_eog(&ctx->model, id)) {
             if (!allow_eog) {
@@ -14271,7 +14687,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }

-    const std::string piece = llama_token_to_piece(ctx, token, false);
+    const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);

     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -14287,260 +14703,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
14287
14703
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
14288
14704
|
}
|
14289
14705
|
|
14290
|
-
//
|
14291
|
-
// Beam search
|
14292
|
-
//
|
14293
|
-
|
14294
|
-
struct llama_beam {
|
14295
|
-
std::vector<llama_token> tokens;
|
14296
|
-
float p; // Cumulative beam probability (renormalized relative to all beams)
|
14297
|
-
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
|
14298
|
-
// Sort beams by probability. In case of ties, prefer beams at eob.
|
14299
|
-
bool operator<(const llama_beam & rhs) const {
|
14300
|
-
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
|
14301
|
-
}
|
14302
|
-
// Shift off first n tokens and discard them.
|
14303
|
-
void shift_tokens(const size_t n) {
|
14304
|
-
if (n) {
|
14305
|
-
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
|
14306
|
-
tokens.resize(tokens.size() - n);
|
14307
|
-
}
|
14308
|
-
}
|
14309
|
-
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
|
14310
|
-
};
|
14311
|
-
|
14312
|
-
// A struct for calculating logit-related info.
|
14313
|
-
struct llama_logit_info {
|
14314
|
-
const float * const logits;
|
14315
|
-
const int n_vocab;
|
14316
|
-
const float max_l;
|
14317
|
-
const float normalizer;
|
14318
|
-
struct sum_exp {
|
14319
|
-
float max_l;
|
14320
|
-
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
|
14321
|
-
};
|
14322
|
-
llama_logit_info(llama_context * ctx)
|
14323
|
-
: logits(llama_get_logits(ctx))
|
14324
|
-
, n_vocab(llama_n_vocab(llama_get_model(ctx)))
|
14325
|
-
, max_l(*std::max_element(logits, logits + n_vocab))
|
14326
|
-
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
|
14327
|
-
{ }
|
14328
|
-
llama_token_data get_token_data(const llama_token token_id) const {
|
14329
|
-
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
|
14330
|
-
return {token_id, logits[token_id], p};
|
14331
|
-
}
|
14332
|
-
// Return top k token_data by logit.
|
14333
|
-
std::vector<llama_token_data> top_k(size_t k) {
|
14334
|
-
std::vector<llama_token_data> min_heap; // min-heap by logit
|
14335
|
-
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
|
14336
|
-
min_heap.reserve(k_min);
|
14337
|
-
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
|
14338
|
-
min_heap.push_back(get_token_data(token_id));
|
14339
|
-
}
|
14340
|
-
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
|
14341
|
-
std::make_heap(min_heap.begin(), min_heap.end(), comp);
|
14342
|
-
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
|
14343
|
-
if (min_heap.front().logit < logits[token_id]) {
|
14344
|
-
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
|
14345
|
-
min_heap.back().id = token_id;
|
14346
|
-
min_heap.back().logit = logits[token_id];
|
14347
|
-
std::push_heap(min_heap.begin(), min_heap.end(), comp);
|
14348
|
-
}
|
14349
|
-
}
|
14350
|
-
return min_heap;
|
14351
|
-
}
|
14352
|
-
float probability_from_logit(float logit) const {
|
14353
|
-
return normalizer * std::exp(logit - max_l);
|
14354
|
-
}
|
14355
|
-
};
|
14356
|
-
|
14357
|
-
struct llama_beam_search_data {
|
14358
|
-
llama_context * ctx;
|
14359
|
-
size_t n_beams;
|
14360
|
-
int n_past;
|
14361
|
-
int n_predict;
|
14362
|
-
std::vector<llama_beam> beams;
|
14363
|
-
std::vector<llama_beam> next_beams;
|
14364
|
-
|
14365
|
-
// Re-calculated on each loop iteration
|
14366
|
-
size_t common_prefix_length;
|
14367
|
-
|
14368
|
-
// Used to communicate to/from callback on beams state.
|
14369
|
-
std::vector<llama_beam_view> beam_views;
|
14370
|
-
|
14371
|
-
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
|
14372
|
-
: ctx(ctx)
|
14373
|
-
, n_beams(n_beams)
|
14374
|
-
, n_past(n_past)
|
14375
|
-
, n_predict(n_predict)
|
14376
|
-
, beam_views(n_beams) {
|
14377
|
-
beams.reserve(n_beams);
|
14378
|
-
next_beams.reserve(n_beams);
|
14379
|
-
}
|
14380
|
-
|
14381
|
-
// Collapse beams to a single beam given by index.
|
14382
|
-
void collapse_beams(const size_t beam_idx) {
|
14383
|
-
if (0u < beam_idx) {
|
14384
|
-
std::swap(beams[0], beams[beam_idx]);
|
14385
|
-
}
|
14386
|
-
beams.resize(1);
|
14387
|
-
}
|
14388
|
-
|
14389
|
-
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
14390
|
-
// The repetitive patterns below reflect the 2 stages of heaps:
|
14391
|
-
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
14392
|
-
// * If the heap is full and a new element is found that should be included, pop the
|
14393
|
-
// least element to the back(), replace it with the new, then push it into the heap.
|
14394
|
-
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
|
14395
|
-
// Min-heaps use a greater-than comparator.
|
14396
|
-
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
|
14397
|
-
if (beam.eob) {
|
14398
|
-
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
|
14399
|
-
if (next_beams.size() < n_beams) {
|
14400
|
-
next_beams.push_back(std::move(beam));
|
14401
|
-
if (next_beams.size() == n_beams) {
|
14402
|
-
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
14403
|
-
}
|
14404
|
-
} else if (next_beams.front().p < beam.p) {
|
14405
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
14406
|
-
next_beams.back() = std::move(beam);
|
14407
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
14408
|
-
}
|
14409
|
-
} else {
|
14410
|
-
// beam is not at end-of-sentence, so branch with next top_k tokens.
|
14411
|
-
if (!beam.tokens.empty()) {
|
14412
|
-
llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
|
14413
|
-
}
|
14414
|
-
llama_logit_info logit_info(ctx);
|
14415
|
-
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
14416
|
-
|
14417
|
-
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
|
14418
|
-
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
|
14419
|
-
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
14420
|
-
|
14421
|
-
size_t i=0;
|
14422
|
-
if (next_beams.size() < n_beams) {
|
14423
|
-
for (; next_beams.size() < n_beams ; ++i) {
|
14424
|
-
llama_beam next_beam = beam;
|
14425
|
-
next_beam.tokens.push_back(next_tokens[i].id);
|
14426
|
-
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
14427
|
-
next_beams.push_back(std::move(next_beam));
|
14428
|
-
}
|
14429
|
-
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
14430
|
-
} else {
|
14431
|
-
for (; next_beams.front().p == 0.0f ; ++i) {
|
14432
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
14433
|
-
next_beams.back() = beam;
|
14434
|
-
next_beams.back().tokens.push_back(next_tokens[i].id);
|
14435
|
-
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
14436
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
14437
|
-
}
|
14438
|
-
}
|
14439
|
-
for (; i < n_beams ; ++i) {
|
14440
|
-
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
|
14441
|
-
if (next_beams.front().p < next_p) {
|
14442
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
14443
|
-
next_beams.back() = beam;
|
14444
|
-
next_beams.back().tokens.push_back(next_tokens[i].id);
|
14445
|
-
next_beams.back().p = next_p;
|
14446
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
14447
|
-
}
|
14448
|
-
}
|
14449
|
-
}
|
14450
|
-
}
|
14451
-    // Find common_prefix_length based on beams.
-    // Requires beams is not empty.
-    size_t find_common_prefix_length() {
-        size_t common_prefix_length = beams[0].tokens.size();
-        for (size_t i = 1 ; i < beams.size() ; ++i) {
-            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
-            for (size_t j = 0 ; j < common_prefix_length ; ++j) {
-                if (beams[0].tokens[j] != beams[i].tokens[j]) {
-                    common_prefix_length = j;
-                    break;
-                }
-            }
-        }
-        return common_prefix_length;
-    }
-
-    // Construct beams_state to send back to caller via the callback function.
-    // Side effect: set common_prefix_length = find_common_prefix_length();
-    llama_beams_state get_beams_state(const bool last_call) {
-        for (size_t i = 0 ; i < beams.size() ; ++i) {
-            beam_views[i] = beams[i].view();
-        }
-        common_prefix_length = find_common_prefix_length();
-        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
-    }
-
-    // Loop:
-    //  * while i < n_predict, AND
-    //  * any of the beams have not yet reached end-of-beam (eob), AND
-    //  * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
-    //    (since all other beam probabilities can only decrease)
-    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
-        beams.push_back({{}, 1.0f, false});  // Start with one empty beam w/ probability = 1.0 and !eob.
-        const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
-        for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
-                         !beams[top_beam_index()].eob ; ++i) {
-            callback(callback_data, get_beams_state(false));  // Sets common_prefix_length
-            update_beams_from_beam_views();  // Update values (p,eob) that callback may have changed.
-            if (common_prefix_length) {
-                llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
-                n_past += common_prefix_length;
-            }
-            // Zero-out next_beam probabilities to place them last in following min-heap.
-            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
-            for (llama_beam & beam : beams) {
-                beam.shift_tokens(common_prefix_length);
-                fill_next_beams_by_top_probabilities(beam);
-            }
-            // next_beams become the beams of next/final iteration. Swap them to re-use memory.
-            beams.swap(next_beams);
-            renormalize_beam_probabilities(beams);
-        }
-        collapse_beams(top_beam_index());
-        callback(callback_data, get_beams_state(true));
-    }
-
-    // As beams grow, the cumulative probabilities decrease.
-    // Renormalize them to avoid floating point underflow.
-    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
-        const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
-        const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
-        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
-    }
-
-    // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
-    size_t top_beam_index() {
-        return std::max_element(beams.begin(), beams.end()) - beams.begin();
-    }
-
-    // Copy (p,eob) for each beam which may have been changed by the callback.
-    void update_beams_from_beam_views() {
-        for (size_t i = 0 ; i < beams.size() ; ++i) {
-            beams[i].p   = beam_views[i].p;
-            beams[i].eob = beam_views[i].eob;
-        }
-    }
-};
-
-void llama_beam_search(llama_context * ctx,
-                       llama_beam_search_callback_fn_t callback, void * callback_data,
-                       size_t n_beams, int n_past, int n_predict) {
-    assert(ctx);
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
-
-    beam_search_data.loop(callback, callback_data);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    ctx->n_sample++;
-}
-
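For reference, the deleted helpers are easy to exercise outside llama.cpp. Below is a minimal standalone sketch, using a hypothetical Beam struct rather than the removed llama_beam types, of the common-prefix and renormalization logic; the renormalization matters because beam probabilities only shrink as beams grow, inviting floating-point underflow.

// Standalone sketch of the removed helpers (hypothetical Beam type).
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

struct Beam { std::vector<int> tokens; float p; };

// Longest prefix shared by all beams, as in the removed find_common_prefix_length().
static size_t common_prefix(const std::vector<Beam> & beams) {
    size_t n = beams[0].tokens.size();
    for (size_t i = 1; i < beams.size(); ++i) {
        n = std::min(n, beams[i].tokens.size());
        for (size_t j = 0; j < n; ++j) {
            if (beams[0].tokens[j] != beams[i].tokens[j]) { n = j; break; }
        }
    }
    return n;
}

int main() {
    std::vector<Beam> beams = {{{1, 2, 3, 4}, 0.02f}, {{1, 2, 3, 9}, 0.01f}, {{1, 2, 7}, 0.01f}};
    printf("common prefix: %zu tokens\n", common_prefix(beams)); // 2 -> {1, 2}
    // Renormalize so the probabilities sum to 1, as renormalize_beam_probabilities() did.
    const float sum = std::accumulate(beams.begin(), beams.end(), 0.0f,
                                      [](float s, const Beam & b) { return s + b.p; });
    for (Beam & b : beams) { b.p /= sum; }
    printf("p[0] after renorm: %.2f\n", beams[0].p); // 0.50
    return 0;
}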
 //
 // quantization
 //
@@ -15751,7 +15913,7 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUDA) || defined(
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
     defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
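The macro list now gates a single runtime capability flag, which callers can query through the public API:

#include <cstdio>
#include "llama.h"

int main() {
    // True only when llama.cpp was built with one of the GPU/RPC backends above.
    printf("GPU offload supported: %s\n", llama_supports_gpu_offload() ? "yes" : "no");
    return 0;
}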
@@ -15808,7 +15970,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-    if (params.rpc_servers != nullptr) {
+    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
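The extra `params.rpc_servers[0] != '\0'` check means an empty string no longer produces a single empty endpoint. A standalone sketch of the same comma-splitting loop (hypothetical helper; endpoints illustrative):

#include <cstdio>
#include <string>
#include <vector>

// Split "host1:50052,host2:50052" on commas, as the loader does for rpc_servers.
static std::vector<std::string> split_servers(std::string servers) {
    std::vector<std::string> out;
    size_t pos = 0;
    while ((pos = servers.find(',')) != std::string::npos) {
        out.push_back(servers.substr(0, pos));
        servers.erase(0, pos + 1);
    }
    out.push_back(servers); // last (or only) endpoint
    return out;
}

int main() {
    for (const auto & s : split_servers("host1:50052,host2:50052")) {
        printf("endpoint: %s\n", s.c_str());
    }
    return 0;
}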
@@ -15862,6 +16024,11 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }
 
+    if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
+        return nullptr;
+    }
+
     llama_context * ctx = new llama_context(*model);
 
     const auto & hparams = model->hparams;
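Callers that quantize the KV cache must now also enable flash attention, otherwise context creation fails with the error above. A minimal sketch, assuming the `type_k`/`type_v`/`flash_attn` fields of `llama_context_params` from this release's llama.h:

#include "llama.h"

// Quantizing the V cache is only accepted together with flash attention.
llama_context_params make_params(void) {
    llama_context_params cparams = llama_context_default_params();
    cparams.type_k     = GGML_TYPE_Q8_0; // K cache may be quantized independently
    cparams.type_v     = GGML_TYPE_Q8_0; // a non-F16 V cache...
    cparams.flash_attn = true;           // ...requires flash_attn, or llama_new_context_with_model returns nullptr
    return cparams;
}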
@@ -15900,8 +16067,8 @@ struct llama_context * llama_new_context_with_model(
 
     cparams.n_ubatch         = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
-    cparams.
-    hparams.
+    cparams.n_ctx_orig_yarn  = params.yarn_orig_ctx    != 0 ? params.yarn_orig_ctx    :
+                               hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
                                hparams.n_ctx_train;
 
     cparams.cb_eval          = params.cb_eval;
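The chained ternary resolves the YaRN original context from the first non-zero source; the same logic spelled out as a hypothetical helper:

#include <cstdint>

// Fallback chain: user parameter, then GGUF metadata, then the training context size.
uint32_t resolve_n_ctx_orig_yarn(uint32_t yarn_orig_ctx, uint32_t n_ctx_orig_yarn, uint32_t n_ctx_train) {
    if (yarn_orig_ctx   != 0) return yarn_orig_ctx;   // explicit override wins
    if (n_ctx_orig_yarn != 0) return n_ctx_orig_yarn; // value stored in the model file
    return n_ctx_train;                               // default: original training context
}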
@@ -15966,17 +16133,7 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(
-        for (auto & server : model->rpc_servers) {
-            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
@@ -16015,7 +16172,7 @@ struct llama_context * llama_new_context_with_model(
             return nullptr;
         }
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            ggml_backend_t backend = ggml_backend_vk_init(
+            ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
                 llama_free(ctx);
@@ -16068,6 +16225,19 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
+#endif
+#if defined(GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
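RPC backends are now created after the local GPU backends rather than replacing them, and only when layers are actually offloaded. A caller opts in through the model params; a minimal sketch (endpoints are illustrative):

#include "llama.h"

// Offload layers to remote rpc-server instances alongside any local GPU backend.
struct llama_model * load_with_rpc(const char * path) {
    llama_model_params mparams = llama_model_default_params();
    mparams.rpc_servers  = "192.168.1.10:50052,192.168.1.11:50052";
    mparams.n_gpu_layers = 99; // RPC backends are only registered when n_gpu_layers > 0
    return llama_load_model_from_file(path, mparams);
}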
@@ -16235,6 +16405,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
         case LLM_ARCH_ARCTIC:
+        case LLM_ARCH_DEEPSEEK2:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
@@ -17849,9 +18020,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
     return model->vocab.id_to_token[token].score;
 }
 
-…
+llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
     GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return model->vocab.id_to_token[token].
+    return model->vocab.id_to_token[token].attr;
 }
 
 bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
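llama_token_get_attr replaces the old single-type accessor with a bitfield, so callers test individual attribute bits. A small sketch, assuming the LLAMA_TOKEN_ATTR_CONTROL flag from this release's llama.h:

#include "llama.h"

// Check whether a token carries the CONTROL attribute bit.
bool has_control_attr(const struct llama_model * model, llama_token token) {
    const llama_token_attr attr = llama_token_get_attr(model, token);
    return (attr & LLAMA_TOKEN_ATTR_CONTROL) != 0;
}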
@@ -17861,6 +18032,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
     );
 }
 
+bool llama_token_is_control(const struct llama_model * model, llama_token token) {
+    return llama_is_control_token(model->vocab, token);
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
     return model->vocab.special_bos_id;
 }
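The new predicate makes it easy to drop BOS/EOS-style tokens when rendering output; a minimal sketch using only the public API shown in this diff:

#include <string>
#include <vector>
#include "llama.h"

// Skip control tokens when turning generated tokens into text. The explicit
// check mirrors what llama_token_to_piece(..., special=false) now does internally.
std::string render(const struct llama_model * model, const std::vector<llama_token> & toks) {
    std::string out;
    char buf[256];
    for (const llama_token t : toks) {
        if (llama_token_is_control(model, t)) continue;
        const int32_t n = llama_token_to_piece(model, t, buf, (int32_t) sizeof(buf), /*special=*/false);
        if (n > 0) out.append(buf, n);
    }
    return out;
}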
@@ -17932,7 +18107,16 @@ static std::string llama_decode_text(const std::string & text) {
 
     const auto cpts = unicode_cpts_from_utf8(text);
     for (const auto cpt : cpts) {
-…
+        const auto utf8 = unicode_cpt_to_utf8(cpt);
+        try {
+            decoded_text += unicode_utf8_to_byte(utf8);
+        } catch (const std::out_of_range & e) {
+            decoded_text += "[UNK_BYTE_0x";
+            for (const auto c : utf8) {
+                decoded_text += format("%02x", (uint8_t) c);
+            }
+            decoded_text += text + "]";
+        }
     }
 
     return decoded_text;
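Instead of letting the out_of_range escape llama_decode_text, an unmappable byte sequence now renders as a visible [UNK_BYTE_0x…] marker. A standalone sketch of the same hex formatting (hypothetical helper; snprintf stands in for llama.cpp's format()):

#include <cstdio>
#include <string>

// Render unmappable UTF-8 bytes the way the new fallback does: [UNK_BYTE_0x..<text>]
std::string unk_byte_marker(const std::string & utf8, const std::string & text) {
    std::string out = "[UNK_BYTE_0x";
    char hex[3];
    for (const unsigned char c : utf8) {
        snprintf(hex, sizeof(hex), "%02x", c);
        out += hex;
    }
    return out + text + "]";
}

int main() {
    printf("%s\n", unk_byte_marker("\xc3\x85", "\xc3\x85").c_str()); // [UNK_BYTE_0xc385Å]
    return 0;
}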
@@ -17940,69 +18124,88 @@ static std::string llama_decode_text(const std::string & text) {
 
 // does not write null-terminator to buf
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    if (!special && llama_is_control_token(model->vocab, token)) {
+        return 0;
+    }
+
+    // if we have a cache - use it
+    {
+        const auto & cache = model->vocab.cache_token_to_piece;
+
+        if (!cache.empty()) {
+            const auto & res = cache.at(token);
+            if (length < (int) res.size()) {
+                return -(int) res.size();
+            }
+            memcpy(buf, res.c_str(), res.size());
+            return res.size();
+        }
+    }
+
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
-…
+            case LLAMA_VOCAB_TYPE_WPM:
+            case LLAMA_VOCAB_TYPE_SPM: {
+                // NOTE: we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                if (llama_is_normal_token(model->vocab, token)) {
+                    std::string result = model->vocab.id_to_token[token].text;
+                    llama_unescape_whitespace(result);
+                    if (length < (int) result.length()) {
+                        return -(int) result.length();
+                    }
+                    memcpy(buf, result.c_str(), result.length());
+                    return result.length();
+                } else if (
+                        (llama_is_user_defined_token(model->vocab, token)) ||
+                        (llama_is_control_token     (model->vocab, token) && special)) {
+                    std::string result = model->vocab.id_to_token[token].text;
+                    if (length < (int) result.length()) {
+                        return -(int) result.length();
+                    }
+                    memcpy(buf, result.c_str(), result.length());
+                    return result.length();
+                } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+                    if (length < 3) {
+                        return -3;
+                    }
+                    memcpy(buf, "\xe2\x96\x85", 3);
+                    return 3;
+                } else if (llama_is_byte_token(model->vocab, token)) {
+                    if (length < 1) {
+                        return -1;
+                    }
+                    buf[0] = llama_token_to_byte(model->vocab, token);
+                    return 1;
                 }
-…
-                return 1;
+                break;
             }
-…
+            case LLAMA_VOCAB_TYPE_BPE: {
+                // NOTE: we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                if (llama_is_normal_token(model->vocab, token)) {
+                    std::string result = model->vocab.id_to_token[token].text;
+                    result = llama_decode_text(result);
+                    if (length < (int) result.length()) {
+                        return -(int) result.length();
+                    }
+                    memcpy(buf, result.c_str(), result.length());
+                    return result.length();
+                } else if (
+                        (llama_is_user_defined_token(model->vocab, token)) ||
+                        (llama_is_control_token     (model->vocab, token) && special)) {
+                    std::string result = model->vocab.id_to_token[token].text;
+                    if (length < (int) result.length()) {
+                        return -(int) result.length();
+                    }
+                    memcpy(buf, result.c_str(), result.length());
+                    return result.length();
                 }
-…
-                return result.length();
+                break;
             }
-…
-            default:
-                GGML_ASSERT(false);
+            default:
+                GGML_ASSERT(false);
         }
     }
     return 0;
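As before, a negative return value is the required buffer size; the usual caller idiom retries once with a grown buffer (a sketch, not code from this diff):

#include <algorithm>
#include <string>
#include "llama.h"

// Convert one token to text, growing the buffer when the first call reports -needed.
std::string token_to_piece(const struct llama_model * model, llama_token token, bool special) {
    std::string piece(8, '\0');
    int32_t n = llama_token_to_piece(model, token, piece.data(), (int32_t) piece.size(), special);
    if (n < 0) {
        piece.resize((size_t) -n);
        n = llama_token_to_piece(model, token, piece.data(), (int32_t) piece.size(), special);
    }
    piece.resize((size_t) std::max(n, 0)); // n == 0 for suppressed control tokens
    return piece;
}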
@@ -18337,6 +18540,7 @@ const char * llama_print_system_info(void) {
     s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
     s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
     s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
+    s += "SVE = "         + std::to_string(ggml_cpu_has_sve())         + " | ";
     s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
     s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
     s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";