llama_cpp 0.15.3 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/llama_cpp.cpp +27 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +66 -36
- data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
- data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +35 -16
- data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -7
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -35
- data/vendor/tmp/llama.cpp/ggml-metal.metal +146 -80
- data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +345 -227
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +458 -329
- data/vendor/tmp/llama.cpp/ggml.c +301 -409
- data/vendor/tmp/llama.cpp/ggml.h +19 -23
- data/vendor/tmp/llama.cpp/llama.cpp +855 -651
- data/vendor/tmp/llama.cpp/llama.h +28 -48
- metadata +121 -6
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
|
@@ -13,8 +13,6 @@
|
|
|
13
13
|
|
|
14
14
|
#ifdef GGML_USE_CUDA
|
|
15
15
|
# include "ggml-cuda.h"
|
|
16
|
-
#elif defined(GGML_USE_CLBLAST)
|
|
17
|
-
# include "ggml-opencl.h"
|
|
18
16
|
#elif defined(GGML_USE_VULKAN)
|
|
19
17
|
# include "ggml-vulkan.h"
|
|
20
18
|
#elif defined(GGML_USE_SYCL)
|
|
@@ -103,14 +101,14 @@
|
|
|
103
101
|
#endif
|
|
104
102
|
|
|
105
103
|
#define LLAMA_MAX_NODES 8192
|
|
106
|
-
#define LLAMA_MAX_EXPERTS
|
|
104
|
+
#define LLAMA_MAX_EXPERTS 160
|
|
107
105
|
|
|
108
106
|
//
|
|
109
107
|
// logging
|
|
110
108
|
//
|
|
111
109
|
|
|
112
110
|
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
|
113
|
-
static void llama_log_internal (ggml_log_level level, const char* format, ...);
|
|
111
|
+
static void llama_log_internal (ggml_log_level level, const char * format, ...);
|
|
114
112
|
static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
|
115
113
|
|
|
116
114
|
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
|
@@ -222,6 +220,7 @@ enum llm_arch {
|
|
|
222
220
|
LLM_ARCH_DBRX,
|
|
223
221
|
LLM_ARCH_OLMO,
|
|
224
222
|
LLM_ARCH_ARCTIC,
|
|
223
|
+
LLM_ARCH_DEEPSEEK2,
|
|
225
224
|
LLM_ARCH_UNKNOWN,
|
|
226
225
|
};
|
|
227
226
|
|
|
@@ -259,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
|
259
258
|
{ LLM_ARCH_DBRX, "dbrx" },
|
|
260
259
|
{ LLM_ARCH_OLMO, "olmo" },
|
|
261
260
|
{ LLM_ARCH_ARCTIC, "arctic" },
|
|
261
|
+
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
|
|
262
262
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
|
263
263
|
};
|
|
264
264
|
|
|
@@ -279,11 +279,15 @@ enum llm_kv {
|
|
|
279
279
|
LLM_KV_CONTEXT_LENGTH,
|
|
280
280
|
LLM_KV_EMBEDDING_LENGTH,
|
|
281
281
|
LLM_KV_BLOCK_COUNT,
|
|
282
|
+
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
|
|
282
283
|
LLM_KV_FEED_FORWARD_LENGTH,
|
|
284
|
+
LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
|
|
283
285
|
LLM_KV_USE_PARALLEL_RESIDUAL,
|
|
284
286
|
LLM_KV_TENSOR_DATA_LAYOUT,
|
|
285
287
|
LLM_KV_EXPERT_COUNT,
|
|
286
288
|
LLM_KV_EXPERT_USED_COUNT,
|
|
289
|
+
LLM_KV_EXPERT_SHARED_COUNT,
|
|
290
|
+
LLM_KV_EXPERT_WEIGHTS_SCALE,
|
|
287
291
|
LLM_KV_POOLING_TYPE,
|
|
288
292
|
LLM_KV_LOGIT_SCALE,
|
|
289
293
|
|
|
@@ -296,6 +300,8 @@ enum llm_kv {
|
|
|
296
300
|
LLM_KV_ATTENTION_LAYERNORM_EPS,
|
|
297
301
|
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
|
|
298
302
|
LLM_KV_ATTENTION_CAUSAL,
|
|
303
|
+
LLM_KV_ATTENTION_Q_LORA_RANK,
|
|
304
|
+
LLM_KV_ATTENTION_KV_LORA_RANK,
|
|
299
305
|
|
|
300
306
|
LLM_KV_ROPE_DIMENSION_COUNT,
|
|
301
307
|
LLM_KV_ROPE_FREQ_BASE,
|
|
@@ -305,6 +311,7 @@ enum llm_kv {
|
|
|
305
311
|
LLM_KV_ROPE_SCALING_ATTN_FACTOR,
|
|
306
312
|
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
|
|
307
313
|
LLM_KV_ROPE_SCALING_FINETUNED,
|
|
314
|
+
LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
|
|
308
315
|
|
|
309
316
|
LLM_KV_SPLIT_NO,
|
|
310
317
|
LLM_KV_SPLIT_COUNT,
|
|
@@ -353,17 +360,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
|
353
360
|
{ LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
|
|
354
361
|
{ LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
|
|
355
362
|
|
|
356
|
-
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size"
|
|
357
|
-
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length"
|
|
358
|
-
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length"
|
|
359
|
-
{ LLM_KV_BLOCK_COUNT, "%s.block_count"
|
|
360
|
-
{
|
|
361
|
-
{
|
|
362
|
-
{
|
|
363
|
-
{
|
|
364
|
-
{
|
|
365
|
-
{
|
|
366
|
-
{
|
|
363
|
+
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
|
|
364
|
+
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
|
365
|
+
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
|
|
366
|
+
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
|
|
367
|
+
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
|
|
368
|
+
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
|
|
369
|
+
{ LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
|
|
370
|
+
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
|
|
371
|
+
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
|
372
|
+
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
|
373
|
+
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
|
374
|
+
{ LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
|
|
375
|
+
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
|
|
376
|
+
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
|
|
377
|
+
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
|
367
378
|
|
|
368
379
|
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
|
369
380
|
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
|
@@ -374,6 +385,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
|
374
385
|
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
|
|
375
386
|
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
|
|
376
387
|
{ LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
|
|
388
|
+
{ LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
|
|
389
|
+
{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
|
|
377
390
|
|
|
378
391
|
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
|
379
392
|
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
|
@@ -383,6 +396,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
|
383
396
|
{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
|
|
384
397
|
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
|
|
385
398
|
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
|
|
399
|
+
{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
|
|
386
400
|
|
|
387
401
|
{ LLM_KV_SPLIT_NO, "split.no" },
|
|
388
402
|
{ LLM_KV_SPLIT_COUNT, "split.count" },
|
|
@@ -474,6 +488,12 @@ enum llm_tensor {
|
|
|
474
488
|
LLM_TENSOR_SSM_A,
|
|
475
489
|
LLM_TENSOR_SSM_D,
|
|
476
490
|
LLM_TENSOR_SSM_OUT,
|
|
491
|
+
LLM_TENSOR_ATTN_Q_A,
|
|
492
|
+
LLM_TENSOR_ATTN_Q_B,
|
|
493
|
+
LLM_TENSOR_ATTN_KV_A_MQA,
|
|
494
|
+
LLM_TENSOR_ATTN_KV_B,
|
|
495
|
+
LLM_TENSOR_ATTN_Q_A_NORM,
|
|
496
|
+
LLM_TENSOR_ATTN_KV_A_NORM,
|
|
477
497
|
};
|
|
478
498
|
|
|
479
499
|
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
|
@@ -1057,6 +1077,35 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
|
1057
1077
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
|
1058
1078
|
},
|
|
1059
1079
|
},
|
|
1080
|
+
{
|
|
1081
|
+
LLM_ARCH_DEEPSEEK2,
|
|
1082
|
+
{
|
|
1083
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
1084
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
1085
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
|
1086
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
1087
|
+
{ LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
|
|
1088
|
+
{ LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
|
|
1089
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
1090
|
+
{ LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
|
|
1091
|
+
{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
|
|
1092
|
+
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
|
|
1093
|
+
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
|
|
1094
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
1095
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
1096
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|
1097
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
1098
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
1099
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
|
1100
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
|
1101
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
|
1102
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
|
1103
|
+
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
|
1104
|
+
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
|
1105
|
+
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
|
1106
|
+
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
|
1107
|
+
},
|
|
1108
|
+
},
|
|
1060
1109
|
{
|
|
1061
1110
|
LLM_ARCH_UNKNOWN,
|
|
1062
1111
|
{
|
|
@@ -1651,12 +1700,13 @@ struct llama_mlock {
|
|
|
1651
1700
|
};
|
|
1652
1701
|
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
|
1653
1702
|
|
|
1654
|
-
|
|
1703
|
+
// NOTE: avoid ever using this except for building the token_to_piece caches
|
|
1704
|
+
static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
|
|
1655
1705
|
std::vector<char> result(8, 0);
|
|
1656
|
-
const int n_tokens = llama_token_to_piece(
|
|
1706
|
+
const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
|
|
1657
1707
|
if (n_tokens < 0) {
|
|
1658
1708
|
result.resize(-n_tokens);
|
|
1659
|
-
int check = llama_token_to_piece(
|
|
1709
|
+
int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
|
|
1660
1710
|
GGML_ASSERT(check == -n_tokens);
|
|
1661
1711
|
}
|
|
1662
1712
|
else {
|
|
@@ -1741,6 +1791,7 @@ enum e_model {
|
|
|
1741
1791
|
MODEL_13B,
|
|
1742
1792
|
MODEL_14B,
|
|
1743
1793
|
MODEL_15B,
|
|
1794
|
+
MODEL_16B,
|
|
1744
1795
|
MODEL_20B,
|
|
1745
1796
|
MODEL_30B,
|
|
1746
1797
|
MODEL_34B,
|
|
@@ -1748,6 +1799,7 @@ enum e_model {
|
|
|
1748
1799
|
MODEL_40B,
|
|
1749
1800
|
MODEL_65B,
|
|
1750
1801
|
MODEL_70B,
|
|
1802
|
+
MODEL_236B,
|
|
1751
1803
|
MODEL_314B,
|
|
1752
1804
|
MODEL_SMALL,
|
|
1753
1805
|
MODEL_MEDIUM,
|
|
@@ -1783,13 +1835,21 @@ struct llama_hparams {
|
|
|
1783
1835
|
uint32_t n_expert_used = 0;
|
|
1784
1836
|
uint32_t n_vocab_type = 0; // for BERT-style token types
|
|
1785
1837
|
|
|
1838
|
+
uint32_t n_layer_dense_lead = 0;
|
|
1839
|
+
uint32_t n_lora_q = 0;
|
|
1840
|
+
uint32_t n_lora_kv = 0;
|
|
1841
|
+
uint32_t n_ff_exp = 0;
|
|
1842
|
+
uint32_t n_expert_shared = 0;
|
|
1843
|
+
float expert_weights_scale = 0.0;
|
|
1844
|
+
|
|
1786
1845
|
float f_norm_eps;
|
|
1787
1846
|
float f_norm_rms_eps;
|
|
1788
1847
|
|
|
1789
1848
|
float rope_attn_factor = 1.0f;
|
|
1790
1849
|
float rope_freq_base_train;
|
|
1791
1850
|
float rope_freq_scale_train;
|
|
1792
|
-
uint32_t
|
|
1851
|
+
uint32_t n_ctx_orig_yarn;
|
|
1852
|
+
float rope_yarn_log_mul;
|
|
1793
1853
|
|
|
1794
1854
|
// for State Space Models
|
|
1795
1855
|
uint32_t ssm_d_conv = 0;
|
|
@@ -1823,8 +1883,14 @@ struct llama_hparams {
|
|
|
1823
1883
|
if (this->n_expert != other.n_expert) return true;
|
|
1824
1884
|
if (this->n_expert_used != other.n_expert_used) return true;
|
|
1825
1885
|
|
|
1886
|
+
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
|
|
1887
|
+
if (this->n_lora_q != other.n_lora_q) return true;
|
|
1888
|
+
if (this->n_lora_kv != other.n_lora_kv) return true;
|
|
1889
|
+
if (this->n_ff_exp != other.n_ff_exp) return true;
|
|
1890
|
+
if (this->n_expert_shared != other.n_expert_shared) return true;
|
|
1891
|
+
|
|
1826
1892
|
if (this->rope_finetuned != other.rope_finetuned) return true;
|
|
1827
|
-
if (this->
|
|
1893
|
+
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
|
|
1828
1894
|
|
|
1829
1895
|
if (this->ssm_d_conv != other.ssm_d_conv) return true;
|
|
1830
1896
|
if (this->ssm_d_inner != other.ssm_d_inner) return true;
|
|
@@ -1838,6 +1904,8 @@ struct llama_hparams {
|
|
|
1838
1904
|
if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
|
|
1839
1905
|
if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
|
|
1840
1906
|
if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
|
|
1907
|
+
if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
|
|
1908
|
+
if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
|
|
1841
1909
|
|
|
1842
1910
|
return false;
|
|
1843
1911
|
}
|
|
@@ -1881,7 +1949,7 @@ struct llama_cparams {
|
|
|
1881
1949
|
float rope_freq_base;
|
|
1882
1950
|
float rope_freq_scale;
|
|
1883
1951
|
|
|
1884
|
-
uint32_t
|
|
1952
|
+
uint32_t n_ctx_orig_yarn;
|
|
1885
1953
|
// These hyperparameters are not exposed in GGUF, because all
|
|
1886
1954
|
// existing YaRN models use the same values for them.
|
|
1887
1955
|
float yarn_ext_factor;
|
|
@@ -1913,6 +1981,8 @@ struct llama_layer {
|
|
|
1913
1981
|
struct ggml_tensor * attn_k_norm_b;
|
|
1914
1982
|
struct ggml_tensor * attn_out_norm;
|
|
1915
1983
|
struct ggml_tensor * attn_out_norm_b;
|
|
1984
|
+
struct ggml_tensor * attn_q_a_norm;
|
|
1985
|
+
struct ggml_tensor * attn_kv_a_norm;
|
|
1916
1986
|
|
|
1917
1987
|
// attention
|
|
1918
1988
|
struct ggml_tensor * wq;
|
|
@@ -1920,6 +1990,10 @@ struct llama_layer {
|
|
|
1920
1990
|
struct ggml_tensor * wv;
|
|
1921
1991
|
struct ggml_tensor * wo;
|
|
1922
1992
|
struct ggml_tensor * wqkv;
|
|
1993
|
+
struct ggml_tensor * wq_a;
|
|
1994
|
+
struct ggml_tensor * wq_b;
|
|
1995
|
+
struct ggml_tensor * wkv_a_mqa;
|
|
1996
|
+
struct ggml_tensor * wkv_b;
|
|
1923
1997
|
|
|
1924
1998
|
// attention bias
|
|
1925
1999
|
struct ggml_tensor * bq;
|
|
@@ -1953,8 +2027,9 @@ struct llama_layer {
|
|
|
1953
2027
|
struct ggml_tensor * ffn_up_shexp;
|
|
1954
2028
|
|
|
1955
2029
|
// ff bias
|
|
1956
|
-
struct ggml_tensor *
|
|
1957
|
-
struct ggml_tensor *
|
|
2030
|
+
struct ggml_tensor * ffn_gate_b = nullptr;
|
|
2031
|
+
struct ggml_tensor * ffn_down_b = nullptr; // b2
|
|
2032
|
+
struct ggml_tensor * ffn_up_b = nullptr; // b3
|
|
1958
2033
|
struct ggml_tensor * ffn_act;
|
|
1959
2034
|
|
|
1960
2035
|
// mamba proj
|
|
@@ -2072,12 +2147,12 @@ struct llama_control_vector {
|
|
|
2072
2147
|
struct llama_vocab {
|
|
2073
2148
|
using id = int32_t;
|
|
2074
2149
|
using token = std::string;
|
|
2075
|
-
using
|
|
2150
|
+
using tattr = llama_token_attr;
|
|
2076
2151
|
|
|
2077
2152
|
struct token_data {
|
|
2078
2153
|
token text;
|
|
2079
2154
|
float score;
|
|
2080
|
-
|
|
2155
|
+
tattr attr;
|
|
2081
2156
|
};
|
|
2082
2157
|
|
|
2083
2158
|
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
|
@@ -2086,7 +2161,8 @@ struct llama_vocab {
|
|
|
2086
2161
|
std::unordered_map<token, id> token_to_id;
|
|
2087
2162
|
std::vector<token_data> id_to_token;
|
|
2088
2163
|
|
|
2089
|
-
std::
|
|
2164
|
+
std::vector<id> cache_special_tokens;
|
|
2165
|
+
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
|
|
2090
2166
|
|
|
2091
2167
|
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
|
2092
2168
|
|
|
@@ -2293,13 +2369,34 @@ struct llama_context {
|
|
|
2293
2369
|
struct llama_control_vector cvec;
|
|
2294
2370
|
};
|
|
2295
2371
|
|
|
2372
|
+
static size_t llama_get_device_count(const llama_model & model) {
|
|
2373
|
+
size_t count = 1;
|
|
2374
|
+
#if defined(GGML_USE_CUDA)
|
|
2375
|
+
count = ggml_backend_cuda_get_device_count();
|
|
2376
|
+
#elif defined(GGML_USE_SYCL)
|
|
2377
|
+
count = ggml_backend_sycl_get_device_count();
|
|
2378
|
+
#elif defined(GGML_USE_VULKAN)
|
|
2379
|
+
count = ggml_backend_vk_get_device_count();
|
|
2380
|
+
#endif
|
|
2381
|
+
#if defined(GGML_USE_RPC)
|
|
2382
|
+
count += model.rpc_servers.size();
|
|
2383
|
+
#endif
|
|
2384
|
+
return count;
|
|
2385
|
+
GGML_UNUSED(model);
|
|
2386
|
+
}
|
|
2387
|
+
|
|
2296
2388
|
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
|
2297
2389
|
ggml_backend_buffer_type_t buft = nullptr;
|
|
2298
2390
|
|
|
2299
|
-
#
|
|
2300
|
-
|
|
2301
|
-
|
|
2302
|
-
|
|
2391
|
+
#if defined(GGML_USE_RPC)
|
|
2392
|
+
int dev_count = (int)llama_get_device_count(model);
|
|
2393
|
+
int rpc_count = (int)model.rpc_servers.size();
|
|
2394
|
+
if (gpu >= dev_count - rpc_count) {
|
|
2395
|
+
const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
|
|
2396
|
+
return ggml_backend_rpc_buffer_type(endpoint);
|
|
2397
|
+
}
|
|
2398
|
+
#endif
|
|
2399
|
+
#if defined(GGML_USE_METAL)
|
|
2303
2400
|
buft = ggml_backend_metal_buffer_type();
|
|
2304
2401
|
#elif defined(GGML_USE_CUDA)
|
|
2305
2402
|
buft = ggml_backend_cuda_buffer_type(gpu);
|
|
@@ -2307,8 +2404,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
|
|
|
2307
2404
|
buft = ggml_backend_vk_buffer_type(gpu);
|
|
2308
2405
|
#elif defined(GGML_USE_SYCL)
|
|
2309
2406
|
buft = ggml_backend_sycl_buffer_type(gpu);
|
|
2310
|
-
#elif defined(GGML_USE_CLBLAST)
|
|
2311
|
-
buft = ggml_backend_opencl_buffer_type();
|
|
2312
2407
|
#elif defined(GGML_USE_KOMPUTE)
|
|
2313
2408
|
buft = ggml_backend_kompute_buffer_type(gpu);
|
|
2314
2409
|
if (buft == nullptr) {
|
|
@@ -2347,29 +2442,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
|
|
|
2347
2442
|
GGML_UNUSED(tensor_split);
|
|
2348
2443
|
}
|
|
2349
2444
|
|
|
2350
|
-
static size_t llama_get_device_count(const llama_model & model) {
|
|
2351
|
-
#if defined(GGML_USE_RPC)
|
|
2352
|
-
return model.rpc_servers.size();
|
|
2353
|
-
#elif defined(GGML_USE_CUDA)
|
|
2354
|
-
return ggml_backend_cuda_get_device_count();
|
|
2355
|
-
#elif defined(GGML_USE_SYCL)
|
|
2356
|
-
return ggml_backend_sycl_get_device_count();
|
|
2357
|
-
#elif defined(GGML_USE_VULKAN)
|
|
2358
|
-
return ggml_backend_vk_get_device_count();
|
|
2359
|
-
#else
|
|
2360
|
-
return 1;
|
|
2361
|
-
#endif
|
|
2362
|
-
GGML_UNUSED(model);
|
|
2363
|
-
}
|
|
2364
|
-
|
|
2365
2445
|
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
|
2366
2446
|
#if defined(GGML_USE_RPC)
|
|
2367
|
-
|
|
2368
|
-
|
|
2369
|
-
|
|
2370
|
-
|
|
2371
|
-
|
|
2372
|
-
|
|
2447
|
+
int dev_count = (int)llama_get_device_count(model);
|
|
2448
|
+
int rpc_count = (int)model.rpc_servers.size();
|
|
2449
|
+
if (device >= dev_count - rpc_count) {
|
|
2450
|
+
size_t total;
|
|
2451
|
+
size_t free;
|
|
2452
|
+
const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
|
|
2453
|
+
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
|
2454
|
+
return free;
|
|
2455
|
+
}
|
|
2456
|
+
#endif
|
|
2457
|
+
#if defined(GGML_USE_CUDA)
|
|
2373
2458
|
size_t total;
|
|
2374
2459
|
size_t free;
|
|
2375
2460
|
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
|
@@ -2441,10 +2526,6 @@ static bool llama_kv_cache_init(
|
|
|
2441
2526
|
}
|
|
2442
2527
|
}
|
|
2443
2528
|
|
|
2444
|
-
#ifdef GGML_USE_CLBLAST
|
|
2445
|
-
offload = false;
|
|
2446
|
-
#endif
|
|
2447
|
-
|
|
2448
2529
|
// count used buffer types
|
|
2449
2530
|
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
|
2450
2531
|
if (offload) {
|
|
@@ -3832,6 +3913,7 @@ static const char * llama_model_type_name(e_model type) {
|
|
|
3832
3913
|
case MODEL_13B: return "13B";
|
|
3833
3914
|
case MODEL_14B: return "14B";
|
|
3834
3915
|
case MODEL_15B: return "15B";
|
|
3916
|
+
case MODEL_16B: return "16B";
|
|
3835
3917
|
case MODEL_20B: return "20B";
|
|
3836
3918
|
case MODEL_30B: return "30B";
|
|
3837
3919
|
case MODEL_34B: return "34B";
|
|
@@ -3839,6 +3921,7 @@ static const char * llama_model_type_name(e_model type) {
|
|
|
3839
3921
|
case MODEL_40B: return "40B";
|
|
3840
3922
|
case MODEL_65B: return "65B";
|
|
3841
3923
|
case MODEL_70B: return "70B";
|
|
3924
|
+
case MODEL_236B: return "236B";
|
|
3842
3925
|
case MODEL_314B: return "314B";
|
|
3843
3926
|
case MODEL_SMALL: return "0.1B";
|
|
3844
3927
|
case MODEL_MEDIUM: return "0.4B";
|
|
@@ -3922,8 +4005,8 @@ static void llm_load_hparams(
|
|
|
3922
4005
|
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
|
3923
4006
|
hparams.rope_finetuned = rope_finetuned;
|
|
3924
4007
|
|
|
3925
|
-
hparams.
|
|
3926
|
-
ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.
|
|
4008
|
+
hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
|
|
4009
|
+
ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
|
|
3927
4010
|
|
|
3928
4011
|
// rope_freq_base (optional)
|
|
3929
4012
|
hparams.rope_freq_base_train = 10000.0f;
|
|
@@ -3981,7 +4064,9 @@ static void llm_load_hparams(
|
|
|
3981
4064
|
switch (hparams.n_layer) {
|
|
3982
4065
|
case 22: model.type = e_model::MODEL_1B; break;
|
|
3983
4066
|
case 26: model.type = e_model::MODEL_3B; break;
|
|
3984
|
-
|
|
4067
|
+
// granite uses a vocab with len 49152
|
|
4068
|
+
case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
|
|
4069
|
+
case 36: model.type = e_model::MODEL_8B; break; // granite
|
|
3985
4070
|
case 40: model.type = e_model::MODEL_13B; break;
|
|
3986
4071
|
case 48: model.type = e_model::MODEL_34B; break;
|
|
3987
4072
|
case 60: model.type = e_model::MODEL_30B; break;
|
|
@@ -4251,6 +4336,8 @@ static void llm_load_hparams(
|
|
|
4251
4336
|
case 30: model.type = e_model::MODEL_3B; break;
|
|
4252
4337
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
4253
4338
|
case 40: model.type = e_model::MODEL_15B; break;
|
|
4339
|
+
case 52: model.type = e_model::MODEL_20B; break; // granite
|
|
4340
|
+
case 88: model.type = e_model::MODEL_34B; break; // granite
|
|
4254
4341
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
4255
4342
|
}
|
|
4256
4343
|
} break;
|
|
@@ -4384,6 +4471,26 @@ static void llm_load_hparams(
|
|
|
4384
4471
|
model.type = e_model::MODEL_UNKNOWN;
|
|
4385
4472
|
}
|
|
4386
4473
|
} break;
|
|
4474
|
+
case LLM_ARCH_DEEPSEEK2:
|
|
4475
|
+
{
|
|
4476
|
+
bool is_lite = (hparams.n_layer == 27);
|
|
4477
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
4478
|
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
4479
|
+
if (!is_lite) {
|
|
4480
|
+
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
|
4481
|
+
}
|
|
4482
|
+
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
|
4483
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
4484
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
4485
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
4486
|
+
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
|
4487
|
+
|
|
4488
|
+
switch (hparams.n_layer) {
|
|
4489
|
+
case 27: model.type = e_model::MODEL_16B; break;
|
|
4490
|
+
case 60: model.type = e_model::MODEL_236B; break;
|
|
4491
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
|
4492
|
+
}
|
|
4493
|
+
} break;
|
|
4387
4494
|
default: (void)0;
|
|
4388
4495
|
}
|
|
4389
4496
|
|
|
@@ -4490,15 +4597,14 @@ static void llm_load_vocab(
|
|
|
4490
4597
|
vocab.special_cls_id = 101;
|
|
4491
4598
|
vocab.special_mask_id = 103;
|
|
4492
4599
|
vocab.add_space_prefix = false;
|
|
4493
|
-
} else {
|
|
4494
|
-
|
|
4495
|
-
|
|
4496
|
-
|
|
4497
|
-
|
|
4498
|
-
|
|
4499
|
-
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
|
4500
|
-
return;
|
|
4600
|
+
} else if (tokenizer_model == "gpt2") {
|
|
4601
|
+
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
|
4602
|
+
|
|
4603
|
+
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
|
4604
|
+
if (add_space_prefix_keyidx != -1) {
|
|
4605
|
+
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
|
4501
4606
|
}
|
|
4607
|
+
|
|
4502
4608
|
// read bpe merges and populate bpe ranks
|
|
4503
4609
|
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
|
|
4504
4610
|
if (merges_keyidx == -1) {
|
|
@@ -4532,6 +4638,8 @@ static void llm_load_vocab(
|
|
|
4532
4638
|
vocab.special_pad_id = -1;
|
|
4533
4639
|
vocab.special_cls_id = -1;
|
|
4534
4640
|
vocab.special_mask_id = -1;
|
|
4641
|
+
} else {
|
|
4642
|
+
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
|
4535
4643
|
}
|
|
4536
4644
|
|
|
4537
4645
|
// for now, only BPE models have pre-tokenizers
|
|
@@ -4593,6 +4701,9 @@ static void llm_load_vocab(
|
|
|
4593
4701
|
} else if (
|
|
4594
4702
|
tokenizer_pre == "dbrx") {
|
|
4595
4703
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
|
4704
|
+
} else if (
|
|
4705
|
+
tokenizer_pre == "smaug-bpe") {
|
|
4706
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
|
|
4596
4707
|
} else {
|
|
4597
4708
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
|
4598
4709
|
}
|
|
@@ -4631,7 +4742,20 @@ static void llm_load_vocab(
|
|
|
4631
4742
|
auto & token_data = vocab.id_to_token[i];
|
|
4632
4743
|
token_data.text = std::move(word);
|
|
4633
4744
|
token_data.score = scores ? scores[i] : 0.0f;
|
|
4634
|
-
token_data.
|
|
4745
|
+
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
|
|
4746
|
+
|
|
4747
|
+
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
|
|
4748
|
+
switch(toktypes[i]) {
|
|
4749
|
+
case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
|
|
4750
|
+
case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
|
|
4751
|
+
case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
|
|
4752
|
+
case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
|
|
4753
|
+
case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
|
|
4754
|
+
case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
|
|
4755
|
+
case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
|
4756
|
+
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
|
4757
|
+
}
|
|
4758
|
+
}
|
|
4635
4759
|
}
|
|
4636
4760
|
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
|
|
4637
4761
|
|
|
@@ -4721,96 +4845,88 @@ static void llm_load_vocab(
|
|
|
4721
4845
|
|
|
4722
4846
|
// build special tokens cache
|
|
4723
4847
|
{
|
|
4724
|
-
|
|
4725
|
-
|
|
4726
|
-
|
|
4727
|
-
|
|
4728
|
-
|
|
4729
|
-
// From testing, this appears to correlate 1:1 with special tokens.
|
|
4730
|
-
//
|
|
4848
|
+
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
|
|
4849
|
+
if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
|
|
4850
|
+
vocab.cache_special_tokens.push_back(id);
|
|
4851
|
+
}
|
|
4852
|
+
}
|
|
4731
4853
|
|
|
4732
|
-
|
|
4733
|
-
|
|
4734
|
-
|
|
4735
|
-
|
|
4736
|
-
|
|
4854
|
+
std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
|
|
4855
|
+
[&] (const llama_vocab::id a, const llama_vocab::id b) {
|
|
4856
|
+
return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
|
|
4857
|
+
}
|
|
4858
|
+
);
|
|
4737
4859
|
|
|
4738
|
-
|
|
4860
|
+
LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
|
|
4861
|
+
}
|
|
4739
4862
|
|
|
4740
|
-
|
|
4741
|
-
|
|
4742
|
-
|
|
4863
|
+
// build token to piece cache
|
|
4864
|
+
{
|
|
4865
|
+
size_t size_cache = 0;
|
|
4743
4866
|
|
|
4744
|
-
|
|
4745
|
-
if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
|
|
4746
|
-
special_tokens_count_by_type++;
|
|
4747
|
-
}
|
|
4867
|
+
std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
|
|
4748
4868
|
|
|
4749
|
-
|
|
4750
|
-
|
|
4751
|
-
bool is_tokenizable = false;
|
|
4869
|
+
for (uint32_t id = 0; id < n_vocab; ++id) {
|
|
4870
|
+
cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
|
|
4752
4871
|
|
|
4753
|
-
|
|
4754
|
-
|
|
4755
|
-
for (unsigned i = 1; i < token.length();) {
|
|
4756
|
-
const auto left = token.substr(0, i);
|
|
4757
|
-
const auto right = token.substr(i);
|
|
4872
|
+
size_cache += cache_token_to_piece[id].size();
|
|
4873
|
+
}
|
|
4758
4874
|
|
|
4759
|
-
|
|
4760
|
-
auto utf = utf8_len(left.at(left.length() - 1));
|
|
4875
|
+
std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
|
|
4761
4876
|
|
|
4762
|
-
|
|
4763
|
-
|
|
4764
|
-
|
|
4765
|
-
|
|
4766
|
-
|
|
4767
|
-
|
|
4768
|
-
|
|
4769
|
-
|
|
4770
|
-
|
|
4771
|
-
|
|
4772
|
-
|
|
4877
|
+
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
|
|
4878
|
+
}
|
|
4879
|
+
|
|
4880
|
+
// Handle per token attributes
|
|
4881
|
+
//NOTE: Each model customizes per token attributes.
|
|
4882
|
+
//NOTE: Per token attributes are missing from the GGUF file.
|
|
4883
|
+
//TODO: Extract attributes from GGUF file.
|
|
4884
|
+
{
|
|
4885
|
+
auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
|
|
4886
|
+
for (auto substr : substrs) {
|
|
4887
|
+
if (str.find(substr) < std::string::npos) {
|
|
4888
|
+
return true;
|
|
4773
4889
|
}
|
|
4890
|
+
}
|
|
4891
|
+
return false;
|
|
4892
|
+
};
|
|
4774
4893
|
|
|
4775
|
-
|
|
4776
|
-
|
|
4777
|
-
|
|
4894
|
+
auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
|
|
4895
|
+
uint32_t current = vocab.id_to_token.at(id).attr;
|
|
4896
|
+
current = value ? (current | attr) : (current & ~attr);
|
|
4897
|
+
vocab.id_to_token[id].attr = (llama_token_attr) current;
|
|
4898
|
+
};
|
|
4778
4899
|
|
|
4779
|
-
|
|
4780
|
-
|
|
4781
|
-
|
|
4782
|
-
utf8_str_len++;
|
|
4783
|
-
i += utf8_len(token.at(i));
|
|
4784
|
-
}
|
|
4900
|
+
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
|
|
4901
|
+
_set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
|
|
4902
|
+
};
|
|
4785
4903
|
|
|
4786
|
-
|
|
4787
|
-
|
|
4788
|
-
// At this point what we have left are special tokens only
|
|
4789
|
-
vocab.special_tokens_cache[token] = id;
|
|
4904
|
+
std::string model_name;
|
|
4905
|
+
std::string tokenizer_pre;
|
|
4790
4906
|
|
|
4791
|
-
|
|
4792
|
-
|
|
4907
|
+
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
|
|
4908
|
+
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
|
4793
4909
|
|
|
4794
|
-
|
|
4795
|
-
|
|
4796
|
-
|
|
4797
|
-
|
|
4798
|
-
}
|
|
4799
|
-
}
|
|
4910
|
+
// model name to lowercase
|
|
4911
|
+
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
|
|
4912
|
+
[] (const std::string::value_type x) {
|
|
4913
|
+
return std::tolower(x);
|
|
4800
4914
|
}
|
|
4801
|
-
|
|
4915
|
+
);
|
|
4802
4916
|
|
|
4803
|
-
|
|
4804
|
-
|
|
4805
|
-
|
|
4806
|
-
|
|
4807
|
-
|
|
4808
|
-
|
|
4809
|
-
|
|
4810
|
-
|
|
4811
|
-
|
|
4812
|
-
|
|
4813
|
-
)
|
|
4917
|
+
// set attributes by model/tokenizer name
|
|
4918
|
+
if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
|
|
4919
|
+
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
|
|
4920
|
+
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
|
|
4921
|
+
for (auto id : vocab.cache_special_tokens) {
|
|
4922
|
+
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
|
4923
|
+
}
|
|
4924
|
+
for (auto token : {"</s>"}) {
|
|
4925
|
+
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
|
4926
|
+
}
|
|
4927
|
+
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
|
|
4928
|
+
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
|
|
4929
|
+
}
|
|
4814
4930
|
}
|
|
4815
4931
|
}
|
|
4816
4932
|
}
|
|
@@ -4852,7 +4968,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
|
4852
4968
|
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
|
4853
4969
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
|
4854
4970
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
|
4855
|
-
LLAMA_LOG_INFO("%s:
|
|
4971
|
+
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
|
4856
4972
|
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
|
|
4857
4973
|
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
|
4858
4974
|
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
|
@@ -4892,6 +5008,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
|
4892
5008
|
if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
|
|
4893
5009
|
if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
|
|
4894
5010
|
if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
|
|
5011
|
+
|
|
5012
|
+
if (model.arch == LLM_ARCH_DEEPSEEK2) {
|
|
5013
|
+
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
|
5014
|
+
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
|
5015
|
+
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
|
|
5016
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
5017
|
+
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
|
5018
|
+
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
|
5019
|
+
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
|
5020
|
+
}
|
|
4895
5021
|
}
|
|
4896
5022
|
|
|
4897
5023
|
// Returns false if cancelled by progress_callback
|
|
@@ -5048,8 +5174,6 @@ static bool llm_load_tensors(
|
|
|
5048
5174
|
throw std::runtime_error("model has expert layers but no expert layers are used");
|
|
5049
5175
|
}
|
|
5050
5176
|
|
|
5051
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
|
5052
|
-
|
|
5053
5177
|
ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
|
|
5054
5178
|
ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
|
|
5055
5179
|
ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
|
|
@@ -5069,12 +5193,10 @@ static bool llm_load_tensors(
|
|
|
5069
5193
|
// output
|
|
5070
5194
|
{
|
|
5071
5195
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
5072
|
-
|
|
5073
|
-
|
|
5074
|
-
|
|
5075
|
-
|
|
5076
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
|
5077
|
-
}
|
|
5196
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5197
|
+
// if output is NULL, init from the input tok embed
|
|
5198
|
+
if (model.output == NULL) {
|
|
5199
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
|
5078
5200
|
}
|
|
5079
5201
|
}
|
|
5080
5202
|
|
|
@@ -5103,6 +5225,11 @@ static bool llm_load_tensors(
|
|
|
5103
5225
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
|
5104
5226
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
|
5105
5227
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
5228
|
+
|
|
5229
|
+
// optional MLP bias
|
|
5230
|
+
layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5231
|
+
layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5232
|
+
layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5106
5233
|
} else {
|
|
5107
5234
|
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
|
5108
5235
|
|
|
@@ -6210,6 +6337,70 @@ static bool llm_load_tensors(
|
|
|
6210
6337
|
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
|
6211
6338
|
}
|
|
6212
6339
|
} break;
|
|
6340
|
+
case LLM_ARCH_DEEPSEEK2:
|
|
6341
|
+
{
|
|
6342
|
+
bool is_lite = (hparams.n_layer == 27);
|
|
6343
|
+
|
|
6344
|
+
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
|
6345
|
+
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
|
6346
|
+
const uint32_t q_lora_rank = hparams.n_lora_q;
|
|
6347
|
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
|
6348
|
+
const uint32_t n_ff_exp = hparams.n_ff_exp;
|
|
6349
|
+
|
|
6350
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
6351
|
+
|
|
6352
|
+
// output
|
|
6353
|
+
{
|
|
6354
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
6355
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
|
6356
|
+
}
|
|
6357
|
+
|
|
6358
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
6359
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
6360
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
6361
|
+
|
|
6362
|
+
auto & layer = model.layers[i];
|
|
6363
|
+
|
|
6364
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
6365
|
+
if (!is_lite) {
|
|
6366
|
+
layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
|
|
6367
|
+
}
|
|
6368
|
+
layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
|
|
6369
|
+
|
|
6370
|
+
if (!is_lite) {
|
|
6371
|
+
layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
|
|
6372
|
+
layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
|
|
6373
|
+
} else {
|
|
6374
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
|
|
6375
|
+
}
|
|
6376
|
+
layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
|
|
6377
|
+
layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
|
|
6378
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
|
|
6379
|
+
|
|
6380
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
6381
|
+
|
|
6382
|
+
if ((uint32_t) i < hparams.n_layer_dense_lead) {
|
|
6383
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
|
6384
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
|
6385
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
6386
|
+
} else {
|
|
6387
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
|
6388
|
+
|
|
6389
|
+
GGML_ASSERT(hparams.n_expert > 0);
|
|
6390
|
+
GGML_ASSERT(hparams.n_expert_used > 0);
|
|
6391
|
+
|
|
6392
|
+
// MoE branch
|
|
6393
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
|
6394
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
|
|
6395
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
|
6396
|
+
|
|
6397
|
+
// Shared expert branch
|
|
6398
|
+
layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
|
|
6399
|
+
layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * hparams.n_expert_shared, n_embd});
|
|
6400
|
+
layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
|
|
6401
|
+
}
|
|
6402
|
+
}
|
|
6403
|
+
} break;
|
|
6213
6404
|
default:
|
|
6214
6405
|
throw std::runtime_error("unknown architecture");
|
|
6215
6406
|
}
|
|
@@ -6664,6 +6855,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|
|
6664
6855
|
int64_t n_expert_used,
|
|
6665
6856
|
llm_ffn_op_type type_op,
|
|
6666
6857
|
bool norm_w,
|
|
6858
|
+
bool scale_w,
|
|
6859
|
+
float w_scale,
|
|
6667
6860
|
const llm_build_cb & cb,
|
|
6668
6861
|
int il) {
|
|
6669
6862
|
int64_t n_embd = cur->ne[0];
|
|
@@ -6695,6 +6888,10 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|
|
6695
6888
|
|
|
6696
6889
|
weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
|
|
6697
6890
|
}
|
|
6891
|
+
if (scale_w) {
|
|
6892
|
+
weights = ggml_scale(ctx, weights, w_scale);
|
|
6893
|
+
cb(weights, "ffn_moe_weights_scaled", il);
|
|
6894
|
+
}
|
|
6698
6895
|
|
|
6699
6896
|
cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
|
|
6700
6897
|
ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
|
@@ -6937,7 +7134,7 @@ struct llm_build_context {
|
|
|
6937
7134
|
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
|
|
6938
7135
|
const int32_t n_outputs;
|
|
6939
7136
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
|
6940
|
-
const int32_t
|
|
7137
|
+
const int32_t n_ctx_orig;
|
|
6941
7138
|
|
|
6942
7139
|
const bool flash_attn;
|
|
6943
7140
|
|
|
@@ -6986,7 +7183,7 @@ struct llm_build_context {
|
|
|
6986
7183
|
n_kv (worst_case ? kv_self.size : kv_self.n),
|
|
6987
7184
|
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
|
|
6988
7185
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
|
6989
|
-
|
|
7186
|
+
n_ctx_orig (cparams.n_ctx_orig_yarn),
|
|
6990
7187
|
flash_attn (cparams.flash_attn),
|
|
6991
7188
|
pooling_type (cparams.pooling_type),
|
|
6992
7189
|
rope_type (hparams.rope_type),
|
|
@@ -7044,7 +7241,7 @@ struct llm_build_context {
|
|
|
7044
7241
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
|
|
7045
7242
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
|
7046
7243
|
0),
|
|
7047
|
-
lctx.inp_K_shift, rope_factors, n_rot, rope_type,
|
|
7244
|
+
lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7048
7245
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
7049
7246
|
|
|
7050
7247
|
cb(tmp, "K_shifted", il);
|
|
@@ -7153,7 +7350,7 @@ struct llm_build_context {
|
|
|
7153
7350
|
// choose long/short freq factors based on the context size
|
|
7154
7351
|
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
|
|
7155
7352
|
|
|
7156
|
-
if (n_ctx_pre_seq > hparams.
|
|
7353
|
+
if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
|
|
7157
7354
|
return model.layers[il].rope_long;
|
|
7158
7355
|
}
|
|
7159
7356
|
|
|
@@ -7269,14 +7466,14 @@ struct llm_build_context {
|
|
|
7269
7466
|
|
|
7270
7467
|
Qcur = ggml_rope_ext(
|
|
7271
7468
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7272
|
-
n_rot, rope_type,
|
|
7469
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7273
7470
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7274
7471
|
);
|
|
7275
7472
|
cb(Qcur, "Qcur", il);
|
|
7276
7473
|
|
|
7277
7474
|
Kcur = ggml_rope_ext(
|
|
7278
7475
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7279
|
-
n_rot, rope_type,
|
|
7476
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7280
7477
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7281
7478
|
);
|
|
7282
7479
|
cb(Kcur, "Kcur", il);
|
|
@@ -7305,9 +7502,9 @@ struct llm_build_context {
|
|
|
7305
7502
|
cb(cur, "ffn_norm", il);
|
|
7306
7503
|
|
|
7307
7504
|
cur = llm_build_ffn(ctx0, cur,
|
|
7308
|
-
model.layers[il].ffn_up,
|
|
7309
|
-
model.layers[il].ffn_gate,
|
|
7310
|
-
model.layers[il].ffn_down,
|
|
7505
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
|
7506
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
|
|
7507
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
|
7311
7508
|
NULL,
|
|
7312
7509
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
|
7313
7510
|
cb(cur, "ffn_out", il);
|
|
@@ -7325,6 +7522,7 @@ struct llm_build_context {
|
|
|
7325
7522
|
model.layers[il].ffn_down_exps,
|
|
7326
7523
|
n_expert, n_expert_used,
|
|
7327
7524
|
LLM_FFN_SILU, true,
|
|
7525
|
+
false, 0.0,
|
|
7328
7526
|
cb, il);
|
|
7329
7527
|
cb(cur, "ffn_moe_out", il);
|
|
7330
7528
|
}
|
|
@@ -7399,12 +7597,12 @@ struct llm_build_context {
|
|
|
7399
7597
|
case MODEL_7B:
|
|
7400
7598
|
Qcur = ggml_rope_ext(
|
|
7401
7599
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7402
|
-
n_rot, rope_type,
|
|
7600
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7403
7601
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7404
7602
|
);
|
|
7405
7603
|
Kcur = ggml_rope_ext(
|
|
7406
7604
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7407
|
-
n_rot, rope_type,
|
|
7605
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7408
7606
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7409
7607
|
);
|
|
7410
7608
|
break;
|
|
@@ -7511,14 +7709,14 @@ struct llm_build_context {
|
|
|
7511
7709
|
|
|
7512
7710
|
Qcur = ggml_rope_ext(
|
|
7513
7711
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7514
|
-
n_rot, rope_type,
|
|
7712
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7515
7713
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7516
7714
|
);
|
|
7517
7715
|
cb(Qcur, "Qcur", il);
|
|
7518
7716
|
|
|
7519
7717
|
Kcur = ggml_rope_ext(
|
|
7520
7718
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7521
|
-
n_rot, rope_type,
|
|
7719
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7522
7720
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7523
7721
|
);
|
|
7524
7722
|
cb(Kcur, "Kcur", il);
|
|
@@ -7631,13 +7829,13 @@ struct llm_build_context {
|
|
|
7631
7829
|
|
|
7632
7830
|
// using mode = 2 for neox mode
|
|
7633
7831
|
Qcur = ggml_rope_ext(
|
|
7634
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
|
7832
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
7635
7833
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
7636
7834
|
);
|
|
7637
7835
|
cb(Qcur, "Qcur", il);
|
|
7638
7836
|
|
|
7639
7837
|
Kcur = ggml_rope_ext(
|
|
7640
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
|
7838
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
7641
7839
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
7642
7840
|
);
|
|
7643
7841
|
cb(Kcur, "Kcur", il);
|
|
@@ -7755,14 +7953,14 @@ struct llm_build_context {
|
|
|
7755
7953
|
|
|
7756
7954
|
Qcur = ggml_rope_ext(
|
|
7757
7955
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7758
|
-
n_rot, rope_type,
|
|
7956
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7759
7957
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7760
7958
|
);
|
|
7761
7959
|
cb(Qcur, "Qcur", il);
|
|
7762
7960
|
|
|
7763
7961
|
Kcur = ggml_rope_ext(
|
|
7764
7962
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7765
|
-
n_rot, rope_type,
|
|
7963
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7766
7964
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7767
7965
|
);
|
|
7768
7966
|
cb(Kcur, "Kcur", il);
|
|
@@ -7806,6 +8004,7 @@ struct llm_build_context {
|
|
|
7806
8004
|
model.layers[il].ffn_down_exps,
|
|
7807
8005
|
n_expert, n_expert_used,
|
|
7808
8006
|
LLM_FFN_GELU, true,
|
|
8007
|
+
false, 0.0,
|
|
7809
8008
|
cb, il);
|
|
7810
8009
|
cb(cur, "ffn_moe_out", il);
|
|
7811
8010
|
|
|
@@ -7907,14 +8106,14 @@ struct llm_build_context {
|
|
|
7907
8106
|
|
|
7908
8107
|
Qcur = ggml_rope_ext(
|
|
7909
8108
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
7910
|
-
n_rot, rope_type,
|
|
8109
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7911
8110
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7912
8111
|
);
|
|
7913
8112
|
cb(Qcur, "Qcur", il);
|
|
7914
8113
|
|
|
7915
8114
|
Kcur = ggml_rope_ext(
|
|
7916
8115
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
7917
|
-
n_rot, rope_type,
|
|
8116
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7918
8117
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7919
8118
|
);
|
|
7920
8119
|
cb(Kcur, "Kcur", il);
|
|
@@ -7949,6 +8148,7 @@ struct llm_build_context {
|
|
|
7949
8148
|
model.layers[il].ffn_down_exps,
|
|
7950
8149
|
n_expert, n_expert_used,
|
|
7951
8150
|
LLM_FFN_SILU, true,
|
|
8151
|
+
false, 0.0,
|
|
7952
8152
|
cb, il);
|
|
7953
8153
|
cb(cur, "ffn_moe_out", il);
|
|
7954
8154
|
|
|
@@ -8260,14 +8460,14 @@ struct llm_build_context {
|
|
|
8260
8460
|
|
|
8261
8461
|
Qcur = ggml_rope_ext(
|
|
8262
8462
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
8263
|
-
n_rot, rope_type,
|
|
8463
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8264
8464
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8265
8465
|
);
|
|
8266
8466
|
cb(Qcur, "Qcur", il);
|
|
8267
8467
|
|
|
8268
8468
|
Kcur = ggml_rope_ext(
|
|
8269
8469
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
8270
|
-
n_rot, rope_type,
|
|
8470
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8271
8471
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8272
8472
|
);
|
|
8273
8473
|
cb(Kcur, "Kcur", il);
|
|
@@ -8700,14 +8900,14 @@ struct llm_build_context {
|
|
|
8700
8900
|
|
|
8701
8901
|
Qcur = ggml_rope_ext(
|
|
8702
8902
|
ctx0, Qcur, inp_pos, nullptr,
|
|
8703
|
-
n_rot, rope_type,
|
|
8903
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8704
8904
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8705
8905
|
);
|
|
8706
8906
|
cb(Qcur, "Qcur", il);
|
|
8707
8907
|
|
|
8708
8908
|
Kcur = ggml_rope_ext(
|
|
8709
8909
|
ctx0, Kcur, inp_pos, nullptr,
|
|
8710
|
-
n_rot, rope_type,
|
|
8910
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8711
8911
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8712
8912
|
);
|
|
8713
8913
|
cb(Kcur, "Kcur", il);
|
|
@@ -8819,13 +9019,13 @@ struct llm_build_context {
|
|
|
8819
9019
|
|
|
8820
9020
|
// using mode = 2 for neox mode
|
|
8821
9021
|
Qcur = ggml_rope_ext(
|
|
8822
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
|
9022
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
8823
9023
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
8824
9024
|
);
|
|
8825
9025
|
cb(Qcur, "Qcur", il);
|
|
8826
9026
|
|
|
8827
9027
|
Kcur = ggml_rope_ext(
|
|
8828
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
|
9028
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
8829
9029
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
8830
9030
|
);
|
|
8831
9031
|
cb(Kcur, "Kcur", il);
|
|
@@ -8931,14 +9131,14 @@ struct llm_build_context {
|
|
|
8931
9131
|
|
|
8932
9132
|
Qcur = ggml_rope_ext(
|
|
8933
9133
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
8934
|
-
n_rot, rope_type,
|
|
9134
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8935
9135
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8936
9136
|
);
|
|
8937
9137
|
cb(Qcur, "Qcur", il);
|
|
8938
9138
|
|
|
8939
9139
|
Kcur = ggml_rope_ext(
|
|
8940
9140
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
8941
|
-
n_rot, rope_type,
|
|
9141
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
8942
9142
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8943
9143
|
);
|
|
8944
9144
|
cb(Kcur, "Kcur", il);
|
|
@@ -9045,14 +9245,14 @@ struct llm_build_context {
|
|
|
9045
9245
|
|
|
9046
9246
|
Qcur = ggml_rope_ext(
|
|
9047
9247
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9048
|
-
n_rot, rope_type,
|
|
9248
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9049
9249
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9050
9250
|
);
|
|
9051
9251
|
cb(Qcur, "Qcur", il);
|
|
9052
9252
|
|
|
9053
9253
|
Kcur = ggml_rope_ext(
|
|
9054
9254
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9055
|
-
n_rot, rope_type,
|
|
9255
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9056
9256
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9057
9257
|
);
|
|
9058
9258
|
cb(Kcur, "Kcur", il);
|
|
@@ -9087,6 +9287,7 @@ struct llm_build_context {
|
|
|
9087
9287
|
model.layers[il].ffn_down_exps,
|
|
9088
9288
|
n_expert, n_expert_used,
|
|
9089
9289
|
LLM_FFN_SILU, false,
|
|
9290
|
+
false, 0.0,
|
|
9090
9291
|
cb, il);
|
|
9091
9292
|
cb(cur, "ffn_moe_out", il);
|
|
9092
9293
|
|
|
@@ -9196,7 +9397,7 @@ struct llm_build_context {
|
|
|
9196
9397
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9197
9398
|
|
|
9198
9399
|
Qcur = ggml_rope_ext(
|
|
9199
|
-
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type,
|
|
9400
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
9200
9401
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9201
9402
|
);
|
|
9202
9403
|
cb(Qcur, "Qcur", il);
|
|
@@ -9207,7 +9408,7 @@ struct llm_build_context {
|
|
|
9207
9408
|
cb(Qcur, "Qcur", il);
|
|
9208
9409
|
|
|
9209
9410
|
Kcur = ggml_rope_ext(
|
|
9210
|
-
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type,
|
|
9411
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
|
|
9211
9412
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9212
9413
|
);
|
|
9213
9414
|
cb(Kcur, "Kcur", il);
|
|
@@ -9318,7 +9519,7 @@ struct llm_build_context {
|
|
|
9318
9519
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9319
9520
|
|
|
9320
9521
|
Qcur = ggml_rope_ext(
|
|
9321
|
-
ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type,
|
|
9522
|
+
ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
|
|
9322
9523
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9323
9524
|
);
|
|
9324
9525
|
cb(Qcur, "Qcur", il);
|
|
@@ -9327,7 +9528,7 @@ struct llm_build_context {
|
|
|
9327
9528
|
cb(Qcur, "Qcur", il);
|
|
9328
9529
|
|
|
9329
9530
|
Kcur = ggml_rope_ext(
|
|
9330
|
-
ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type,
|
|
9531
|
+
ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
|
|
9331
9532
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9332
9533
|
);
|
|
9333
9534
|
cb(Kcur, "Kcur", il);
|
|
@@ -9435,13 +9636,13 @@ struct llm_build_context {
|
|
|
9435
9636
|
|
|
9436
9637
|
Qcur = ggml_rope_ext(
|
|
9437
9638
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
|
|
9438
|
-
n_embd_head, rope_type,
|
|
9639
|
+
n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9439
9640
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9440
9641
|
cb(Qcur, "Qcur", il);
|
|
9441
9642
|
|
|
9442
9643
|
Kcur = ggml_rope_ext(
|
|
9443
9644
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9444
|
-
n_embd_head, rope_type,
|
|
9645
|
+
n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9445
9646
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9446
9647
|
cb(Kcur, "Kcur", il);
|
|
9447
9648
|
|
|
@@ -9643,14 +9844,14 @@ struct llm_build_context {
|
|
|
9643
9844
|
|
|
9644
9845
|
struct ggml_tensor * Qcur = ggml_rope_ext(
|
|
9645
9846
|
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9646
|
-
n_rot, rope_type,
|
|
9847
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9647
9848
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9648
9849
|
);
|
|
9649
9850
|
cb(Qcur, "Qcur", il);
|
|
9650
9851
|
|
|
9651
9852
|
struct ggml_tensor * Kcur = ggml_rope_ext(
|
|
9652
9853
|
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9653
|
-
n_rot, rope_type,
|
|
9854
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9654
9855
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9655
9856
|
);
|
|
9656
9857
|
cb(Kcur, "Kcur", il);
|
|
@@ -9759,14 +9960,14 @@ struct llm_build_context {
|
|
|
9759
9960
|
|
|
9760
9961
|
Qcur = ggml_rope_ext(
|
|
9761
9962
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9762
|
-
n_rot, rope_type,
|
|
9963
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9763
9964
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9764
9965
|
);
|
|
9765
9966
|
cb(Qcur, "Qcur", il);
|
|
9766
9967
|
|
|
9767
9968
|
Kcur = ggml_rope_ext(
|
|
9768
9969
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9769
|
-
n_rot, rope_type,
|
|
9970
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9770
9971
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9771
9972
|
);
|
|
9772
9973
|
cb(Kcur, "Kcur", il);
|
|
@@ -9876,14 +10077,14 @@ struct llm_build_context {
|
|
|
9876
10077
|
|
|
9877
10078
|
Qcur = ggml_rope_ext(
|
|
9878
10079
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9879
|
-
n_rot, rope_type,
|
|
10080
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9880
10081
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9881
10082
|
);
|
|
9882
10083
|
cb(Qcur, "Qcur", il);
|
|
9883
10084
|
|
|
9884
10085
|
Kcur = ggml_rope_ext(
|
|
9885
10086
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9886
|
-
n_rot, rope_type,
|
|
10087
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9887
10088
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9888
10089
|
);
|
|
9889
10090
|
cb(Kcur, "Kcur", il);
|
|
@@ -10006,14 +10207,14 @@ struct llm_build_context {
|
|
|
10006
10207
|
|
|
10007
10208
|
Qcur = ggml_rope_ext(
|
|
10008
10209
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10009
|
-
n_rot, rope_type,
|
|
10210
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10010
10211
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10011
10212
|
);
|
|
10012
10213
|
cb(Qcur, "Qcur", il);
|
|
10013
10214
|
|
|
10014
10215
|
Kcur = ggml_rope_ext(
|
|
10015
10216
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10016
|
-
n_rot, rope_type,
|
|
10217
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10017
10218
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10018
10219
|
);
|
|
10019
10220
|
cb(Kcur, "Kcur", il);
|
|
@@ -10078,7 +10279,7 @@ struct llm_build_context {
|
|
|
10078
10279
|
cb(cur, "lmhead_scaling", -1);
|
|
10079
10280
|
|
|
10080
10281
|
// lm_head
|
|
10081
|
-
cur = ggml_mul_mat(ctx0, model.
|
|
10282
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
10082
10283
|
cb(cur, "result_output", -1);
|
|
10083
10284
|
|
|
10084
10285
|
ggml_build_forward_expand(gf, cur);
|
|
@@ -10126,7 +10327,7 @@ struct llm_build_context {
|
|
|
10126
10327
|
|
|
10127
10328
|
Qcur = ggml_rope_ext(
|
|
10128
10329
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
|
|
10129
|
-
n_embd_head_k, rope_type,
|
|
10330
|
+
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10130
10331
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
10131
10332
|
cb(Qcur, "Qcur", il);
|
|
10132
10333
|
|
|
@@ -10135,7 +10336,7 @@ struct llm_build_context {
|
|
|
10135
10336
|
|
|
10136
10337
|
Kcur = ggml_rope_ext(
|
|
10137
10338
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10138
|
-
n_embd_head_k, rope_type,
|
|
10339
|
+
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10139
10340
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
10140
10341
|
cb(Kcur, "Kcur", il);
|
|
10141
10342
|
|
|
@@ -10246,14 +10447,14 @@ struct llm_build_context {
|
|
|
10246
10447
|
|
|
10247
10448
|
Qcur = ggml_rope_ext(
|
|
10248
10449
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10249
|
-
n_rot, rope_type,
|
|
10450
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10250
10451
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10251
10452
|
);
|
|
10252
10453
|
cb(Qcur, "Qcur", il);
|
|
10253
10454
|
|
|
10254
10455
|
Kcur = ggml_rope_ext(
|
|
10255
10456
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10256
|
-
n_rot, rope_type,
|
|
10457
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10257
10458
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10258
10459
|
);
|
|
10259
10460
|
cb(Kcur, "Kcur", il);
|
|
@@ -10536,14 +10737,14 @@ struct llm_build_context {
|
|
|
10536
10737
|
|
|
10537
10738
|
Qcur = ggml_rope_ext(
|
|
10538
10739
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10539
|
-
n_rot, rope_type,
|
|
10740
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10540
10741
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10541
10742
|
);
|
|
10542
10743
|
cb(Qcur, "Qcur", il);
|
|
10543
10744
|
|
|
10544
10745
|
Kcur = ggml_rope_ext(
|
|
10545
10746
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10546
|
-
n_rot, rope_type,
|
|
10747
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10547
10748
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10548
10749
|
);
|
|
10549
10750
|
cb(Kcur, "Kcur", il);
|
|
@@ -10667,14 +10868,14 @@ struct llm_build_context {
|
|
|
10667
10868
|
|
|
10668
10869
|
Qcur = ggml_rope_ext(
|
|
10669
10870
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10670
|
-
n_rot, rope_type,
|
|
10871
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10671
10872
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10672
10873
|
);
|
|
10673
10874
|
cb(Qcur, "Qcur", il);
|
|
10674
10875
|
|
|
10675
10876
|
Kcur = ggml_rope_ext(
|
|
10676
10877
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10677
|
-
n_rot, rope_type,
|
|
10878
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10678
10879
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10679
10880
|
);
|
|
10680
10881
|
cb(Kcur, "Kcur", il);
|
|
@@ -10781,14 +10982,14 @@ struct llm_build_context {
|
|
|
10781
10982
|
|
|
10782
10983
|
Qcur = ggml_rope_ext(
|
|
10783
10984
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10784
|
-
n_rot, rope_type,
|
|
10985
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10785
10986
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10786
10987
|
);
|
|
10787
10988
|
cb(Qcur, "Qcur", il);
|
|
10788
10989
|
|
|
10789
10990
|
Kcur = ggml_rope_ext(
|
|
10790
10991
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10791
|
-
n_rot, rope_type,
|
|
10992
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10792
10993
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10793
10994
|
);
|
|
10794
10995
|
cb(Kcur, "Kcur", il);
|
|
@@ -10916,14 +11117,14 @@ struct llm_build_context {
|
|
|
10916
11117
|
|
|
10917
11118
|
Qcur = ggml_rope_ext(
|
|
10918
11119
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10919
|
-
n_rot, rope_type,
|
|
11120
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10920
11121
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10921
11122
|
);
|
|
10922
11123
|
cb(Qcur, "Qcur", il);
|
|
10923
11124
|
|
|
10924
11125
|
Kcur = ggml_rope_ext(
|
|
10925
11126
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10926
|
-
n_rot, rope_type,
|
|
11127
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10927
11128
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10928
11129
|
);
|
|
10929
11130
|
cb(Kcur, "Kcur", il);
|
|
@@ -10974,6 +11175,7 @@ struct llm_build_context {
|
|
|
10974
11175
|
model.layers[il].ffn_down_exps,
|
|
10975
11176
|
n_expert, n_expert_used,
|
|
10976
11177
|
LLM_FFN_SILU, true,
|
|
11178
|
+
false, 0.0,
|
|
10977
11179
|
cb, il);
|
|
10978
11180
|
cb(cur, "ffn_moe_out", il);
|
|
10979
11181
|
|
|
@@ -11005,6 +11207,239 @@ struct llm_build_context {
|
|
|
11005
11207
|
|
|
11006
11208
|
return gf;
|
|
11007
11209
|
}
|
|
11210
|
+
|
|
11211
|
+
struct ggml_cgraph * build_deepseek2() {
|
|
11212
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
11213
|
+
|
|
11214
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
|
11215
|
+
int32_t n_tokens = this->n_tokens;
|
|
11216
|
+
|
|
11217
|
+
bool is_lite = (hparams.n_layer == 27);
|
|
11218
|
+
|
|
11219
|
+
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
|
|
11220
|
+
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
|
11221
|
+
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
|
|
11222
|
+
const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
|
|
11223
|
+
const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
|
|
11224
|
+
|
|
11225
|
+
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
|
11226
|
+
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
|
11227
|
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
|
11228
|
+
|
|
11229
|
+
struct ggml_tensor * cur;
|
|
11230
|
+
struct ggml_tensor * inpL;
|
|
11231
|
+
|
|
11232
|
+
// {n_embd, n_tokens}
|
|
11233
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
|
11234
|
+
|
|
11235
|
+
// inp_pos - contains the positions
|
|
11236
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
|
11237
|
+
|
|
11238
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
11239
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
11240
|
+
|
|
11241
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
11242
|
+
struct ggml_tensor * inpSA = inpL;
|
|
11243
|
+
|
|
11244
|
+
// norm
|
|
11245
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
11246
|
+
model.layers[il].attn_norm, NULL,
|
|
11247
|
+
LLM_NORM_RMS, cb, il);
|
|
11248
|
+
cb(cur, "attn_norm", il);
|
|
11249
|
+
|
|
11250
|
+
// self_attention
|
|
11251
|
+
{
|
|
11252
|
+
struct ggml_tensor * q = NULL;
|
|
11253
|
+
if (!is_lite) {
|
|
11254
|
+
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
|
|
11255
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
|
|
11256
|
+
cb(q, "q", il);
|
|
11257
|
+
|
|
11258
|
+
q = llm_build_norm(ctx0, q, hparams,
|
|
11259
|
+
model.layers[il].attn_q_a_norm, NULL,
|
|
11260
|
+
LLM_NORM_RMS, cb, il);
|
|
11261
|
+
cb(q, "q", il);
|
|
11262
|
+
|
|
11263
|
+
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
|
|
11264
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
|
|
11265
|
+
cb(q, "q", il);
|
|
11266
|
+
} else {
|
|
11267
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
|
11268
|
+
cb(q, "q", il);
|
|
11269
|
+
}
|
|
11270
|
+
|
|
11271
|
+
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
|
11272
|
+
struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
|
11273
|
+
ggml_row_size(q->type, hparams.n_embd_head_k),
|
|
11274
|
+
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
|
11275
|
+
0);
|
|
11276
|
+
cb(q_nope, "q_nope", il);
|
|
11277
|
+
|
|
11278
|
+
// and {n_head * n_embd_head_qk_rope, n_tokens}
|
|
11279
|
+
struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
|
11280
|
+
ggml_row_size(q->type, hparams.n_embd_head_k),
|
|
11281
|
+
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
|
11282
|
+
ggml_row_size(q->type, n_embd_head_qk_nope));
|
|
11283
|
+
cb(q_pe, "q_pe", il);
|
|
11284
|
+
|
|
11285
|
+
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
|
|
11286
|
+
struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
|
11287
|
+
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
|
|
11288
|
+
|
|
11289
|
+
// split into {kv_lora_rank, n_tokens}
|
|
11290
|
+
struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
|
|
11291
|
+
kv_pe_compresseed->nb[1],
|
|
11292
|
+
0);
|
|
11293
|
+
cb(kv_compressed, "kv_compressed", il);
|
|
11294
|
+
|
|
11295
|
+
// and {n_embd_head_qk_rope, n_tokens}
|
|
11296
|
+
struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
|
|
11297
|
+
kv_pe_compresseed->nb[1],
|
|
11298
|
+
kv_pe_compresseed->nb[1],
|
|
11299
|
+
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
|
|
11300
|
+
cb(k_pe, "k_pe", il);
|
|
11301
|
+
|
|
11302
|
+
kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
|
|
11303
|
+
kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
|
|
11304
|
+
model.layers[il].attn_kv_a_norm, NULL,
|
|
11305
|
+
LLM_NORM_RMS, cb, il);
|
|
11306
|
+
cb(kv_compressed, "kv_compressed", il);
|
|
11307
|
+
|
|
11308
|
+
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
|
|
11309
|
+
struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
|
|
11310
|
+
cb(kv, "kv", il);
|
|
11311
|
+
|
|
11312
|
+
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
|
11313
|
+
struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
|
|
11314
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
|
|
11315
|
+
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
|
11316
|
+
0);
|
|
11317
|
+
cb(k_nope, "k_nope", il);
|
|
11318
|
+
|
|
11319
|
+
// and {n_head * n_embd_head_v, n_tokens}
|
|
11320
|
+
struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
|
|
11321
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
|
11322
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
|
11323
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
|
11324
|
+
cb(v_states, "v_states", il);
|
|
11325
|
+
|
|
11326
|
+
v_states = ggml_cont(ctx0, v_states);
|
|
11327
|
+
cb(v_states, "v_states", il);
|
|
11328
|
+
|
|
11329
|
+
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
|
|
11330
|
+
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
|
|
11331
|
+
0);
|
|
11332
|
+
cb(v_states, "v_states", il);
|
|
11333
|
+
|
|
11334
|
+
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
|
11335
|
+
q_pe = ggml_rope_ext(
|
|
11336
|
+
ctx0, q_pe, inp_pos, nullptr,
|
|
11337
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
11338
|
+
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
|
11339
|
+
);
|
|
11340
|
+
cb(q_pe, "q_pe", il);
|
|
11341
|
+
|
|
11342
|
+
// shared RoPE key
|
|
11343
|
+
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
|
11344
|
+
k_pe = ggml_rope_ext(
|
|
11345
|
+
ctx0, k_pe, inp_pos, nullptr,
|
|
11346
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
11347
|
+
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
|
11348
|
+
);
|
|
11349
|
+
cb(k_pe, "k_pe", il);
|
|
11350
|
+
|
|
11351
|
+
struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
|
|
11352
|
+
cb(q_states, "q_states", il);
|
|
11353
|
+
|
|
11354
|
+
struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
|
|
11355
|
+
cb(k_states, "k_states", il);
|
|
11356
|
+
|
|
11357
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
11358
|
+
model.layers[il].wo, NULL,
|
|
11359
|
+
k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
|
|
11360
|
+
}
|
|
11361
|
+
|
|
11362
|
+
if (il == n_layer - 1) {
|
|
11363
|
+
// skip computing output for unused tokens
|
|
11364
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11365
|
+
n_tokens = n_outputs;
|
|
11366
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11367
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
11368
|
+
}
|
|
11369
|
+
|
|
11370
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
11371
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
11372
|
+
|
|
11373
|
+
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
|
11374
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
|
11375
|
+
model.layers[il].ffn_norm, NULL,
|
|
11376
|
+
LLM_NORM_RMS, cb, il);
|
|
11377
|
+
cb(cur, "ffn_norm", il);
|
|
11378
|
+
|
|
11379
|
+
cur = llm_build_ffn(ctx0, cur,
|
|
11380
|
+
model.layers[il].ffn_up, NULL,
|
|
11381
|
+
model.layers[il].ffn_gate, NULL,
|
|
11382
|
+
model.layers[il].ffn_down, NULL,
|
|
11383
|
+
NULL,
|
|
11384
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
|
11385
|
+
cb(cur, "ffn_out", il);
|
|
11386
|
+
} else {
|
|
11387
|
+
// MoE branch
|
|
11388
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
|
11389
|
+
model.layers[il].ffn_norm, NULL,
|
|
11390
|
+
LLM_NORM_RMS, cb, il);
|
|
11391
|
+
cb(cur, "ffn_norm", il);
|
|
11392
|
+
|
|
11393
|
+
ggml_tensor * moe_out =
|
|
11394
|
+
llm_build_moe_ffn(ctx0, cur,
|
|
11395
|
+
model.layers[il].ffn_gate_inp,
|
|
11396
|
+
model.layers[il].ffn_up_exps,
|
|
11397
|
+
model.layers[il].ffn_gate_exps,
|
|
11398
|
+
model.layers[il].ffn_down_exps,
|
|
11399
|
+
n_expert, n_expert_used,
|
|
11400
|
+
LLM_FFN_SILU, false,
|
|
11401
|
+
true, hparams.expert_weights_scale,
|
|
11402
|
+
cb, il);
|
|
11403
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
11404
|
+
|
|
11405
|
+
// FFN shared expert
|
|
11406
|
+
{
|
|
11407
|
+
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
|
|
11408
|
+
model.layers[il].ffn_up_shexp, NULL,
|
|
11409
|
+
model.layers[il].ffn_gate_shexp, NULL,
|
|
11410
|
+
model.layers[il].ffn_down_shexp, NULL,
|
|
11411
|
+
NULL,
|
|
11412
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
|
11413
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
11414
|
+
|
|
11415
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
11416
|
+
cb(cur, "ffn_out", il);
|
|
11417
|
+
}
|
|
11418
|
+
}
|
|
11419
|
+
|
|
11420
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
11421
|
+
cb(cur, "l_out", il);
|
|
11422
|
+
|
|
11423
|
+
// input for next layer
|
|
11424
|
+
inpL = cur;
|
|
11425
|
+
}
|
|
11426
|
+
|
|
11427
|
+
cur = inpL;
|
|
11428
|
+
|
|
11429
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
11430
|
+
model.output_norm, NULL,
|
|
11431
|
+
LLM_NORM_RMS, cb, -1);
|
|
11432
|
+
cb(cur, "result_norm", -1);
|
|
11433
|
+
|
|
11434
|
+
// lm_head
|
|
11435
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
11436
|
+
cb(cur, "result_output", -1);
|
|
11437
|
+
|
|
11438
|
+
ggml_build_forward_expand(gf, cur);
|
|
11439
|
+
|
|
11440
|
+
return gf;
|
|
11441
|
+
}
|
|
11442
|
+
|
|
11008
11443
|
};
|
|
11009
11444
|
|
|
11010
11445
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
|
@@ -11223,6 +11658,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
11223
11658
|
{
|
|
11224
11659
|
result = llm.build_arctic();
|
|
11225
11660
|
} break;
|
|
11661
|
+
case LLM_ARCH_DEEPSEEK2:
|
|
11662
|
+
{
|
|
11663
|
+
result = llm.build_deepseek2();
|
|
11664
|
+
} break;
|
|
11226
11665
|
default:
|
|
11227
11666
|
GGML_ASSERT(false);
|
|
11228
11667
|
}
|
|
@@ -12239,27 +12678,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
|
|
|
12239
12678
|
|
|
12240
12679
|
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
|
|
12241
12680
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12242
|
-
return vocab.id_to_token[id].
|
|
12681
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
|
|
12243
12682
|
}
|
|
12244
12683
|
|
|
12245
12684
|
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
|
|
12246
12685
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12247
|
-
return vocab.id_to_token[id].
|
|
12686
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
|
|
12248
12687
|
}
|
|
12249
12688
|
|
|
12250
12689
|
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
|
|
12251
12690
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12252
|
-
return vocab.id_to_token[id].
|
|
12691
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
|
|
12253
12692
|
}
|
|
12254
12693
|
|
|
12255
12694
|
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
|
|
12256
12695
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12257
|
-
return vocab.id_to_token[id].
|
|
12696
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
|
|
12258
12697
|
}
|
|
12259
12698
|
|
|
12260
12699
|
static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
|
|
12261
12700
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
12262
|
-
return vocab.id_to_token[id].
|
|
12701
|
+
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
|
|
12263
12702
|
}
|
|
12264
12703
|
|
|
12265
12704
|
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
|
@@ -12512,6 +12951,7 @@ struct llm_tokenizer_bpe {
|
|
|
12512
12951
|
});
|
|
12513
12952
|
break;
|
|
12514
12953
|
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
|
12954
|
+
case LLAMA_VOCAB_PRE_TYPE_SMAUG:
|
|
12515
12955
|
word_collection = unicode_regex_split(text, {
|
|
12516
12956
|
// same as llama3
|
|
12517
12957
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
|
@@ -12734,7 +13174,7 @@ struct llm_tokenizer_wpm {
|
|
|
12734
13174
|
llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
|
|
12735
13175
|
|
|
12736
13176
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
|
12737
|
-
auto
|
|
13177
|
+
const auto & token_map = vocab.token_to_id;
|
|
12738
13178
|
|
|
12739
13179
|
// normalize and split by whitespace
|
|
12740
13180
|
std::vector<std::string> words = preprocess(text);
|
|
@@ -12749,108 +13189,89 @@ struct llm_tokenizer_wpm {
|
|
|
12749
13189
|
}
|
|
12750
13190
|
|
|
12751
13191
|
// prepend phantom space
|
|
12752
|
-
std::string word1 = "\xe2\x96\x81" + word;
|
|
12753
|
-
int n = word1.size();
|
|
13192
|
+
const std::string word1 = "\xe2\x96\x81" + word;
|
|
13193
|
+
const int n = word1.size();
|
|
12754
13194
|
|
|
12755
|
-
|
|
12756
|
-
int i = 0;
|
|
12757
|
-
bool match_any = false;
|
|
13195
|
+
const size_t current_tokens = output.size();
|
|
12758
13196
|
|
|
13197
|
+
// we're at the start of a new word
|
|
12759
13198
|
// move through character position in word
|
|
12760
|
-
|
|
13199
|
+
for (int i = 0; i < n; ++i) {
|
|
12761
13200
|
// loop through possible match length
|
|
12762
13201
|
bool match = false;
|
|
12763
13202
|
for (int j = n; j > i; j--) {
|
|
12764
|
-
auto it = token_map
|
|
12765
|
-
if (it != token_map
|
|
13203
|
+
auto it = token_map.find(word1.substr(i, j - i));
|
|
13204
|
+
if (it != token_map.end()) {
|
|
12766
13205
|
output.push_back(it->second);
|
|
12767
13206
|
match = true;
|
|
12768
|
-
|
|
12769
|
-
i = j;
|
|
13207
|
+
i = j - 1;
|
|
12770
13208
|
break;
|
|
12771
13209
|
}
|
|
12772
13210
|
}
|
|
12773
13211
|
|
|
12774
|
-
|
|
12775
|
-
|
|
12776
|
-
|
|
13212
|
+
if (!match) { // discard all
|
|
13213
|
+
output.resize(current_tokens);
|
|
13214
|
+
break; // and discard next tokens
|
|
12777
13215
|
}
|
|
12778
13216
|
}
|
|
12779
13217
|
|
|
12780
13218
|
// we didn't find any matches for this word
|
|
12781
|
-
if (
|
|
13219
|
+
if (current_tokens == output.size()) {
|
|
12782
13220
|
output.push_back(vocab.special_unk_id);
|
|
12783
13221
|
}
|
|
12784
13222
|
}
|
|
12785
13223
|
}
|
|
12786
13224
|
|
|
12787
13225
|
std::vector<std::string> preprocess(const std::string & text) {
|
|
12788
|
-
std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
|
|
12789
|
-
|
|
12790
|
-
|
|
12791
|
-
|
|
12792
|
-
|
|
12793
|
-
|
|
12794
|
-
|
|
12795
|
-
|
|
13226
|
+
const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
|
|
13227
|
+
std::vector<std::string> words(1, "");
|
|
13228
|
+
|
|
13229
|
+
for (const char32_t cpt : cpts_nfd) {
|
|
13230
|
+
const auto flags = unicode_cpt_flags(cpt);
|
|
13231
|
+
|
|
13232
|
+
if (flags.is_whitespace) {
|
|
13233
|
+
if (words.back().size()) { // finish previous word if any
|
|
13234
|
+
words.emplace_back();
|
|
13235
|
+
}
|
|
12796
13236
|
continue;
|
|
12797
13237
|
}
|
|
12798
|
-
|
|
12799
|
-
|
|
12800
|
-
|
|
12801
|
-
|
|
12802
|
-
std::string s = unicode_cpt_to_utf8(code);
|
|
12803
|
-
if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
|
|
12804
|
-
new_str += " ";
|
|
12805
|
-
new_str += s;
|
|
12806
|
-
new_str += " ";
|
|
12807
|
-
} else {
|
|
12808
|
-
new_str += s;
|
|
13238
|
+
|
|
13239
|
+
assert (!flags.is_separator);
|
|
13240
|
+
if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
|
|
13241
|
+
continue;
|
|
12809
13242
|
}
|
|
12810
|
-
}
|
|
12811
13243
|
|
|
12812
|
-
|
|
12813
|
-
|
|
12814
|
-
|
|
12815
|
-
|
|
12816
|
-
|
|
12817
|
-
|
|
12818
|
-
|
|
12819
|
-
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
|
12820
|
-
l = r + 1;
|
|
12821
|
-
r = l;
|
|
13244
|
+
const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
|
|
13245
|
+
if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
|
|
13246
|
+
if (words.back().size()) { // finish previous word if any
|
|
13247
|
+
words.emplace_back();
|
|
13248
|
+
}
|
|
13249
|
+
words.back() = s; // single char word
|
|
13250
|
+
words.emplace_back(); // start a new word
|
|
12822
13251
|
} else {
|
|
12823
|
-
|
|
13252
|
+
words.back() += s; // append char to word
|
|
12824
13253
|
}
|
|
12825
13254
|
}
|
|
12826
|
-
if (r > l) {
|
|
12827
|
-
words.push_back(new_str.substr(l, (r - l)));
|
|
12828
|
-
}
|
|
12829
|
-
return words;
|
|
12830
|
-
}
|
|
12831
13255
|
|
|
12832
|
-
|
|
12833
|
-
|
|
12834
|
-
return false;
|
|
13256
|
+
if (!words.back().size()) {
|
|
13257
|
+
words.pop_back();
|
|
12835
13258
|
}
|
|
12836
|
-
|
|
12837
|
-
return
|
|
13259
|
+
|
|
13260
|
+
return words;
|
|
12838
13261
|
}
|
|
12839
13262
|
|
|
12840
|
-
bool is_chinese_char(uint32_t cpt) {
|
|
12841
|
-
|
|
12842
|
-
(cpt >=
|
|
13263
|
+
static bool is_chinese_char(uint32_t cpt) {
|
|
13264
|
+
return
|
|
13265
|
+
(cpt >= 0x04E00 && cpt <= 0x09FFF) ||
|
|
13266
|
+
(cpt >= 0x03400 && cpt <= 0x04DBF) ||
|
|
12843
13267
|
(cpt >= 0x20000 && cpt <= 0x2A6DF) ||
|
|
12844
13268
|
(cpt >= 0x2A700 && cpt <= 0x2B73F) ||
|
|
12845
13269
|
(cpt >= 0x2B740 && cpt <= 0x2B81F) ||
|
|
12846
13270
|
(cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
|
|
12847
|
-
(cpt >=
|
|
12848
|
-
(cpt >= 0x2F800 && cpt <= 0x2FA1F)
|
|
12849
|
-
(cpt >= 0x3000 && cpt <= 0x303F) ||
|
|
12850
|
-
(cpt >= 0xFF00 && cpt <= 0xFFEF)
|
|
12851
|
-
return true; // NOLINT
|
|
12852
|
-
}
|
|
12853
|
-
return false;
|
|
13271
|
+
(cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
|
|
13272
|
+
(cpt >= 0x2F800 && cpt <= 0x2FA1F);
|
|
13273
|
+
//(cpt >= 0x3000 && cpt <= 0x303F) ||
|
|
13274
|
+
//(cpt >= 0xFF00 && cpt <= 0xFFEF);
|
|
12854
13275
|
}
|
|
12855
13276
|
|
|
12856
13277
|
const llama_vocab & vocab;
|
|
@@ -12894,9 +13315,9 @@ struct fragment_buffer_variant {
|
|
|
12894
13315
|
|
|
12895
13316
|
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
|
12896
13317
|
// for each special token
|
|
12897
|
-
for (const
|
|
12898
|
-
const auto &
|
|
12899
|
-
const auto &
|
|
13318
|
+
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
|
|
13319
|
+
const auto & data = vocab.id_to_token[special_id];
|
|
13320
|
+
const auto & special_token = data.text;
|
|
12900
13321
|
|
|
12901
13322
|
// for each text fragment
|
|
12902
13323
|
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
|
|
@@ -12905,7 +13326,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
|
12905
13326
|
|
|
12906
13327
|
// if a fragment is text ( not yet processed )
|
|
12907
13328
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
|
12908
|
-
auto
|
|
13329
|
+
auto & raw_text = fragment.raw_text;
|
|
12909
13330
|
|
|
12910
13331
|
auto raw_text_base_offset = fragment.offset;
|
|
12911
13332
|
auto raw_text_base_length = fragment.length;
|
|
@@ -12915,7 +13336,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
|
12915
13336
|
// find the first occurrence of a given special token in this fragment
|
|
12916
13337
|
// passing offset argument only limit the "search area" but match coordinates
|
|
12917
13338
|
// are still relative to the source full raw_text
|
|
12918
|
-
auto match = raw_text
|
|
13339
|
+
auto match = raw_text.find(special_token, raw_text_base_offset);
|
|
12919
13340
|
|
|
12920
13341
|
// no occurrences found, stop processing this fragment for a given special token
|
|
12921
13342
|
if (match == std::string::npos) break;
|
|
@@ -12933,13 +13354,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
|
12933
13354
|
if (match > raw_text_base_offset) {
|
|
12934
13355
|
// left
|
|
12935
13356
|
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
|
12936
|
-
|
|
12937
|
-
|
|
13357
|
+
int64_t left_reminder_length = match - raw_text_base_offset;
|
|
13358
|
+
|
|
13359
|
+
if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
|
|
13360
|
+
while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
|
|
13361
|
+
left_reminder_length--;
|
|
13362
|
+
}
|
|
13363
|
+
}
|
|
13364
|
+
|
|
13365
|
+
if (left_reminder_length > 0) {
|
|
13366
|
+
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
|
13367
|
+
it++;
|
|
13368
|
+
}
|
|
12938
13369
|
|
|
12939
13370
|
#ifdef PRETOKENIZERDEBUG
|
|
12940
13371
|
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
|
12941
13372
|
#endif
|
|
12942
|
-
it++;
|
|
12943
13373
|
}
|
|
12944
13374
|
|
|
12945
13375
|
// special token
|
|
@@ -12948,16 +13378,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
|
12948
13378
|
|
|
12949
13379
|
// right
|
|
12950
13380
|
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
|
12951
|
-
|
|
12952
|
-
|
|
12953
|
-
|
|
13381
|
+
int64_t right_reminder_offset = match + special_token.length();
|
|
13382
|
+
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
|
13383
|
+
|
|
13384
|
+
if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
|
|
13385
|
+
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
|
|
13386
|
+
right_reminder_offset++;
|
|
13387
|
+
right_reminder_length--;
|
|
13388
|
+
}
|
|
13389
|
+
}
|
|
13390
|
+
|
|
13391
|
+
if (right_reminder_length > 0) {
|
|
13392
|
+
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
|
13393
|
+
it++;
|
|
13394
|
+
}
|
|
12954
13395
|
|
|
12955
13396
|
#ifdef PRETOKENIZERDEBUG
|
|
12956
13397
|
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
|
12957
13398
|
#endif
|
|
12958
13399
|
|
|
12959
|
-
it++;
|
|
12960
|
-
|
|
12961
13400
|
if (source == 0) {
|
|
12962
13401
|
buffer.erase_after(buffer.before_begin());
|
|
12963
13402
|
} else {
|
|
@@ -13003,9 +13442,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
|
13003
13442
|
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
|
13004
13443
|
// tokenizer.encode('', add_special_tokens=False) returns []
|
|
13005
13444
|
|
|
13006
|
-
static const bool rtrim = true; //TODO: as param
|
|
13007
13445
|
bool is_prev_special = false;
|
|
13008
|
-
bool special_token_rtrim = false;
|
|
13009
13446
|
|
|
13010
13447
|
if (add_special && vocab.special_add_bos != 0) {
|
|
13011
13448
|
GGML_ASSERT(vocab.special_bos_id != -1);
|
|
@@ -13015,25 +13452,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
|
13015
13452
|
|
|
13016
13453
|
for (const auto & fragment : fragment_buffer) {
|
|
13017
13454
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
|
13018
|
-
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
|
13019
|
-
|
|
13020
|
-
// TODO: It's likely possible to get rid of this string copy entirely
|
|
13021
|
-
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
|
|
13022
|
-
// and passing 'add space prefix' as bool argument
|
|
13023
|
-
//
|
|
13024
13455
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
|
13025
13456
|
|
|
13026
|
-
if (special_token_rtrim) {
|
|
13027
|
-
size_t num_whitespaces = 0;
|
|
13028
|
-
while (isspace(raw_text[num_whitespaces])) {
|
|
13029
|
-
num_whitespaces++;
|
|
13030
|
-
}
|
|
13031
|
-
if (num_whitespaces == raw_text.size()) {
|
|
13032
|
-
continue; // skip if all whitespaces
|
|
13033
|
-
}
|
|
13034
|
-
raw_text = raw_text.substr(num_whitespaces);
|
|
13035
|
-
}
|
|
13036
|
-
|
|
13037
13457
|
if (vocab.add_space_prefix) {
|
|
13038
13458
|
if (!output.size() || is_prev_special) { // prefix with space if first token
|
|
13039
13459
|
raw_text = " " + raw_text;
|
|
@@ -13049,11 +13469,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
|
13049
13469
|
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
|
13050
13470
|
output.push_back(fragment.token);
|
|
13051
13471
|
is_prev_special = true;
|
|
13052
|
-
// phi-3 special tokens without rtrim, works fine for llama-spm too
|
|
13053
|
-
special_token_rtrim = rtrim
|
|
13054
|
-
&& fragment.token != vocab.special_bos_id
|
|
13055
|
-
&& fragment.token != vocab.special_unk_id
|
|
13056
|
-
&& fragment.token != vocab.special_eos_id;
|
|
13057
13472
|
}
|
|
13058
13473
|
}
|
|
13059
13474
|
|
|
@@ -14054,7 +14469,7 @@ void llama_sample_repetition_penalties(
|
|
|
14054
14469
|
|
|
14055
14470
|
void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
|
|
14056
14471
|
GGML_ASSERT(ctx);
|
|
14057
|
-
|
|
14472
|
+
int64_t t_start_sample_us = ggml_time_us();
|
|
14058
14473
|
|
|
14059
14474
|
bool allow_eog = false;
|
|
14060
14475
|
for (const auto & stack : grammar->stacks) {
|
|
@@ -14066,12 +14481,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|
|
14066
14481
|
|
|
14067
14482
|
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
|
14068
14483
|
candidates_decoded.reserve(candidates->size);
|
|
14069
|
-
|
|
14484
|
+
|
|
14485
|
+
std::vector<llama_grammar_candidate> candidates_grammar;
|
|
14070
14486
|
candidates_grammar.reserve(candidates->size);
|
|
14071
14487
|
|
|
14072
14488
|
for (size_t i = 0; i < candidates->size; ++i) {
|
|
14073
|
-
const llama_token id
|
|
14074
|
-
const std::string piece =
|
|
14489
|
+
const llama_token id = candidates->data[i].id;
|
|
14490
|
+
const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
|
|
14075
14491
|
|
|
14076
14492
|
if (llama_token_is_eog(&ctx->model, id)) {
|
|
14077
14493
|
if (!allow_eog) {
|
|
@@ -14271,7 +14687,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
|
14271
14687
|
GGML_ASSERT(false);
|
|
14272
14688
|
}
|
|
14273
14689
|
|
|
14274
|
-
const std::string piece =
|
|
14690
|
+
const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
|
|
14275
14691
|
|
|
14276
14692
|
// Note terminating 0 in decoded string
|
|
14277
14693
|
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
|
@@ -14287,260 +14703,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
|
14287
14703
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
14288
14704
|
}
|
|
14289
14705
|
|
|
14290
|
-
//
|
|
14291
|
-
// Beam search
|
|
14292
|
-
//
|
|
14293
|
-
|
|
14294
|
-
struct llama_beam {
|
|
14295
|
-
std::vector<llama_token> tokens;
|
|
14296
|
-
float p; // Cumulative beam probability (renormalized relative to all beams)
|
|
14297
|
-
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
|
|
14298
|
-
// Sort beams by probability. In case of ties, prefer beams at eob.
|
|
14299
|
-
bool operator<(const llama_beam & rhs) const {
|
|
14300
|
-
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
|
|
14301
|
-
}
|
|
14302
|
-
// Shift off first n tokens and discard them.
|
|
14303
|
-
void shift_tokens(const size_t n) {
|
|
14304
|
-
if (n) {
|
|
14305
|
-
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
|
|
14306
|
-
tokens.resize(tokens.size() - n);
|
|
14307
|
-
}
|
|
14308
|
-
}
|
|
14309
|
-
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
|
|
14310
|
-
};
|
|
14311
|
-
|
|
14312
|
-
// A struct for calculating logit-related info.
|
|
14313
|
-
struct llama_logit_info {
|
|
14314
|
-
const float * const logits;
|
|
14315
|
-
const int n_vocab;
|
|
14316
|
-
const float max_l;
|
|
14317
|
-
const float normalizer;
|
|
14318
|
-
struct sum_exp {
|
|
14319
|
-
float max_l;
|
|
14320
|
-
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
|
|
14321
|
-
};
|
|
14322
|
-
llama_logit_info(llama_context * ctx)
|
|
14323
|
-
: logits(llama_get_logits(ctx))
|
|
14324
|
-
, n_vocab(llama_n_vocab(llama_get_model(ctx)))
|
|
14325
|
-
, max_l(*std::max_element(logits, logits + n_vocab))
|
|
14326
|
-
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
|
|
14327
|
-
{ }
|
|
14328
|
-
llama_token_data get_token_data(const llama_token token_id) const {
|
|
14329
|
-
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
|
|
14330
|
-
return {token_id, logits[token_id], p};
|
|
14331
|
-
}
|
|
14332
|
-
// Return top k token_data by logit.
|
|
14333
|
-
std::vector<llama_token_data> top_k(size_t k) {
|
|
14334
|
-
std::vector<llama_token_data> min_heap; // min-heap by logit
|
|
14335
|
-
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
|
|
14336
|
-
min_heap.reserve(k_min);
|
|
14337
|
-
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
|
|
14338
|
-
min_heap.push_back(get_token_data(token_id));
|
|
14339
|
-
}
|
|
14340
|
-
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
|
|
14341
|
-
std::make_heap(min_heap.begin(), min_heap.end(), comp);
|
|
14342
|
-
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
|
|
14343
|
-
if (min_heap.front().logit < logits[token_id]) {
|
|
14344
|
-
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
|
|
14345
|
-
min_heap.back().id = token_id;
|
|
14346
|
-
min_heap.back().logit = logits[token_id];
|
|
14347
|
-
std::push_heap(min_heap.begin(), min_heap.end(), comp);
|
|
14348
|
-
}
|
|
14349
|
-
}
|
|
14350
|
-
return min_heap;
|
|
14351
|
-
}
|
|
14352
|
-
float probability_from_logit(float logit) const {
|
|
14353
|
-
return normalizer * std::exp(logit - max_l);
|
|
14354
|
-
}
|
|
14355
|
-
};
|
|
14356
|
-
|
|
14357
|
-
struct llama_beam_search_data {
|
|
14358
|
-
llama_context * ctx;
|
|
14359
|
-
size_t n_beams;
|
|
14360
|
-
int n_past;
|
|
14361
|
-
int n_predict;
|
|
14362
|
-
std::vector<llama_beam> beams;
|
|
14363
|
-
std::vector<llama_beam> next_beams;
|
|
14364
|
-
|
|
14365
|
-
// Re-calculated on each loop iteration
|
|
14366
|
-
size_t common_prefix_length;
|
|
14367
|
-
|
|
14368
|
-
// Used to communicate to/from callback on beams state.
|
|
14369
|
-
std::vector<llama_beam_view> beam_views;
|
|
14370
|
-
|
|
14371
|
-
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
|
|
14372
|
-
: ctx(ctx)
|
|
14373
|
-
, n_beams(n_beams)
|
|
14374
|
-
, n_past(n_past)
|
|
14375
|
-
, n_predict(n_predict)
|
|
14376
|
-
, beam_views(n_beams) {
|
|
14377
|
-
beams.reserve(n_beams);
|
|
14378
|
-
next_beams.reserve(n_beams);
|
|
14379
|
-
}
|
|
14380
|
-
|
|
14381
|
-
// Collapse beams to a single beam given by index.
|
|
14382
|
-
void collapse_beams(const size_t beam_idx) {
|
|
14383
|
-
if (0u < beam_idx) {
|
|
14384
|
-
std::swap(beams[0], beams[beam_idx]);
|
|
14385
|
-
}
|
|
14386
|
-
beams.resize(1);
|
|
14387
|
-
}
|
|
14388
|
-
|
|
14389
|
-
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
|
14390
|
-
// The repetitive patterns below reflect the 2 stages of heaps:
|
|
14391
|
-
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
|
14392
|
-
// * If the heap is full and a new element is found that should be included, pop the
|
|
14393
|
-
// least element to the back(), replace it with the new, then push it into the heap.
|
|
14394
|
-
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
|
|
14395
|
-
// Min-heaps use a greater-than comparator.
|
|
14396
|
-
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
|
|
14397
|
-
if (beam.eob) {
|
|
14398
|
-
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
|
|
14399
|
-
if (next_beams.size() < n_beams) {
|
|
14400
|
-
next_beams.push_back(std::move(beam));
|
|
14401
|
-
if (next_beams.size() == n_beams) {
|
|
14402
|
-
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14403
|
-
}
|
|
14404
|
-
} else if (next_beams.front().p < beam.p) {
|
|
14405
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14406
|
-
next_beams.back() = std::move(beam);
|
|
14407
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14408
|
-
}
|
|
14409
|
-
} else {
|
|
14410
|
-
// beam is not at end-of-sentence, so branch with next top_k tokens.
|
|
14411
|
-
if (!beam.tokens.empty()) {
|
|
14412
|
-
llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
|
|
14413
|
-
}
|
|
14414
|
-
llama_logit_info logit_info(ctx);
|
|
14415
|
-
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
|
14416
|
-
|
|
14417
|
-
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
|
|
14418
|
-
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
|
|
14419
|
-
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
|
14420
|
-
|
|
14421
|
-
size_t i=0;
|
|
14422
|
-
if (next_beams.size() < n_beams) {
|
|
14423
|
-
for (; next_beams.size() < n_beams ; ++i) {
|
|
14424
|
-
llama_beam next_beam = beam;
|
|
14425
|
-
next_beam.tokens.push_back(next_tokens[i].id);
|
|
14426
|
-
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
|
14427
|
-
next_beams.push_back(std::move(next_beam));
|
|
14428
|
-
}
|
|
14429
|
-
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14430
|
-
} else {
|
|
14431
|
-
for (; next_beams.front().p == 0.0f ; ++i) {
|
|
14432
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14433
|
-
next_beams.back() = beam;
|
|
14434
|
-
next_beams.back().tokens.push_back(next_tokens[i].id);
|
|
14435
|
-
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
|
14436
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14437
|
-
}
|
|
14438
|
-
}
|
|
14439
|
-
for (; i < n_beams ; ++i) {
|
|
14440
|
-
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
|
|
14441
|
-
if (next_beams.front().p < next_p) {
|
|
14442
|
-
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14443
|
-
next_beams.back() = beam;
|
|
14444
|
-
next_beams.back().tokens.push_back(next_tokens[i].id);
|
|
14445
|
-
next_beams.back().p = next_p;
|
|
14446
|
-
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
14447
|
-
}
|
|
14448
|
-
}
|
|
14449
|
-
}
|
|
14450
|
-
}
|
|
14451
|
-
|
|
14452
|
-
// Find common_prefix_length based on beams.
|
|
14453
|
-
// Requires beams is not empty.
|
|
14454
|
-
size_t find_common_prefix_length() {
|
|
14455
|
-
size_t common_prefix_length = beams[0].tokens.size();
|
|
14456
|
-
for (size_t i = 1 ; i < beams.size() ; ++i) {
|
|
14457
|
-
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
|
|
14458
|
-
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
|
|
14459
|
-
if (beams[0].tokens[j] != beams[i].tokens[j]) {
|
|
14460
|
-
common_prefix_length = j;
|
|
14461
|
-
break;
|
|
14462
|
-
}
|
|
14463
|
-
}
|
|
14464
|
-
}
|
|
14465
|
-
return common_prefix_length;
|
|
14466
|
-
}
|
|
14467
|
-
|
|
14468
|
-
// Construct beams_state to send back to caller via the callback function.
|
|
14469
|
-
// Side effect: set common_prefix_length = find_common_prefix_length();
|
|
14470
|
-
llama_beams_state get_beams_state(const bool last_call) {
|
|
14471
|
-
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
|
14472
|
-
beam_views[i] = beams[i].view();
|
|
14473
|
-
}
|
|
14474
|
-
common_prefix_length = find_common_prefix_length();
|
|
14475
|
-
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
|
|
14476
|
-
}
|
|
14477
|
-
|
|
14478
|
-
// Loop:
|
|
14479
|
-
// * while i < n_predict, AND
|
|
14480
|
-
// * any of the beams have not yet reached end-of-beam (eob), AND
|
|
14481
|
-
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
|
|
14482
|
-
// (since all other beam probabilities can only decrease)
|
|
14483
|
-
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
|
|
14484
|
-
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
|
|
14485
|
-
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
|
|
14486
|
-
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
|
|
14487
|
-
!beams[top_beam_index()].eob ; ++i) {
|
|
14488
|
-
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
|
|
14489
|
-
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
|
|
14490
|
-
if (common_prefix_length) {
|
|
14491
|
-
llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
|
|
14492
|
-
n_past += common_prefix_length;
|
|
14493
|
-
}
|
|
14494
|
-
// Zero-out next_beam probabilities to place them last in following min-heap.
|
|
14495
|
-
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
|
|
14496
|
-
for (llama_beam & beam : beams) {
|
|
14497
|
-
beam.shift_tokens(common_prefix_length);
|
|
14498
|
-
fill_next_beams_by_top_probabilities(beam);
|
|
14499
|
-
}
|
|
14500
|
-
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
|
|
14501
|
-
beams.swap(next_beams);
|
|
14502
|
-
renormalize_beam_probabilities(beams);
|
|
14503
|
-
}
|
|
14504
|
-
collapse_beams(top_beam_index());
|
|
14505
|
-
callback(callback_data, get_beams_state(true));
|
|
14506
|
-
}
|
|
14507
|
-
|
|
14508
|
-
// As beams grow, the cumulative probabilities decrease.
|
|
14509
|
-
// Renormalize them to avoid floating point underflow.
|
|
14510
|
-
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
|
|
14511
|
-
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
|
|
14512
|
-
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
|
|
14513
|
-
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
|
|
14514
|
-
}
|
|
14515
|
-
|
|
14516
|
-
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
|
|
14517
|
-
size_t top_beam_index() {
|
|
14518
|
-
return std::max_element(beams.begin(), beams.end()) - beams.begin();
|
|
14519
|
-
}
|
|
14520
|
-
|
|
14521
|
-
// Copy (p,eob) for each beam which may have been changed by the callback.
|
|
14522
|
-
void update_beams_from_beam_views() {
|
|
14523
|
-
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
|
14524
|
-
beams[i].p = beam_views[i].p;
|
|
14525
|
-
beams[i].eob = beam_views[i].eob;
|
|
14526
|
-
}
|
|
14527
|
-
}
|
|
14528
|
-
};
|
|
14529
|
-
|
|
14530
|
-
void llama_beam_search(llama_context * ctx,
|
|
14531
|
-
llama_beam_search_callback_fn_t callback, void * callback_data,
|
|
14532
|
-
size_t n_beams, int n_past, int n_predict) {
|
|
14533
|
-
assert(ctx);
|
|
14534
|
-
const int64_t t_start_sample_us = ggml_time_us();
|
|
14535
|
-
|
|
14536
|
-
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
|
|
14537
|
-
|
|
14538
|
-
beam_search_data.loop(callback, callback_data);
|
|
14539
|
-
|
|
14540
|
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
14541
|
-
ctx->n_sample++;
|
|
14542
|
-
}
|
|
14543
|
-
|
|
14544
14706
|
//
|
|
14545
14707
|
// quantization
|
|
14546
14708
|
//
|
|
@@ -15751,7 +15913,7 @@ bool llama_supports_mlock(void) {
|
|
|
15751
15913
|
}
|
|
15752
15914
|
|
|
15753
15915
|
bool llama_supports_gpu_offload(void) {
|
|
15754
|
-
#if defined(GGML_USE_CUDA) || defined(
|
|
15916
|
+
#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
|
15755
15917
|
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
|
|
15756
15918
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
|
15757
15919
|
return true;
|
|
@@ -15808,7 +15970,7 @@ struct llama_model * llama_load_model_from_file(
|
|
|
15808
15970
|
return true;
|
|
15809
15971
|
};
|
|
15810
15972
|
}
|
|
15811
|
-
if (params.rpc_servers != nullptr) {
|
|
15973
|
+
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
|
|
15812
15974
|
// split the servers set them into model->rpc_servers
|
|
15813
15975
|
std::string servers(params.rpc_servers);
|
|
15814
15976
|
size_t pos = 0;
|
|
@@ -15862,6 +16024,11 @@ struct llama_context * llama_new_context_with_model(
|
|
|
15862
16024
|
params.flash_attn = false;
|
|
15863
16025
|
}
|
|
15864
16026
|
|
|
16027
|
+
if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
|
|
16028
|
+
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
|
|
16029
|
+
return nullptr;
|
|
16030
|
+
}
|
|
16031
|
+
|
|
15865
16032
|
llama_context * ctx = new llama_context(*model);
|
|
15866
16033
|
|
|
15867
16034
|
const auto & hparams = model->hparams;
|
|
@@ -15900,8 +16067,8 @@ struct llama_context * llama_new_context_with_model(
|
|
|
15900
16067
|
|
|
15901
16068
|
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
|
15902
16069
|
|
|
15903
|
-
cparams.
|
|
15904
|
-
hparams.
|
|
16070
|
+
cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
|
|
16071
|
+
hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
|
|
15905
16072
|
hparams.n_ctx_train;
|
|
15906
16073
|
|
|
15907
16074
|
cparams.cb_eval = params.cb_eval;
|
|
@@ -15966,17 +16133,7 @@ struct llama_context * llama_new_context_with_model(
|
|
|
15966
16133
|
|
|
15967
16134
|
if (!hparams.vocab_only) {
|
|
15968
16135
|
// initialize backends
|
|
15969
|
-
#if defined(
|
|
15970
|
-
for (auto & server : model->rpc_servers) {
|
|
15971
|
-
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
|
15972
|
-
if (backend == nullptr) {
|
|
15973
|
-
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
|
15974
|
-
llama_free(ctx);
|
|
15975
|
-
return nullptr;
|
|
15976
|
-
}
|
|
15977
|
-
ctx->backends.push_back(backend);
|
|
15978
|
-
}
|
|
15979
|
-
#elif defined(GGML_USE_METAL)
|
|
16136
|
+
#if defined(GGML_USE_METAL)
|
|
15980
16137
|
if (model->n_gpu_layers > 0) {
|
|
15981
16138
|
ctx->backend_metal = ggml_backend_metal_init();
|
|
15982
16139
|
if (ctx->backend_metal == nullptr) {
|
|
@@ -16015,7 +16172,7 @@ struct llama_context * llama_new_context_with_model(
|
|
|
16015
16172
|
return nullptr;
|
|
16016
16173
|
}
|
|
16017
16174
|
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
|
16018
|
-
ggml_backend_t backend = ggml_backend_vk_init(
|
|
16175
|
+
ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
|
|
16019
16176
|
if (backend == nullptr) {
|
|
16020
16177
|
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
|
16021
16178
|
llama_free(ctx);
|
|
@@ -16068,6 +16225,19 @@ struct llama_context * llama_new_context_with_model(
|
|
|
16068
16225
|
}
|
|
16069
16226
|
ctx->backends.push_back(backend);
|
|
16070
16227
|
}
|
|
16228
|
+
#endif
|
|
16229
|
+
#if defined(GGML_USE_RPC)
|
|
16230
|
+
if (model->n_gpu_layers > 0) {
|
|
16231
|
+
for (const auto & endpoint : model->rpc_servers) {
|
|
16232
|
+
ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
|
|
16233
|
+
if (backend == nullptr) {
|
|
16234
|
+
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
|
|
16235
|
+
llama_free(ctx);
|
|
16236
|
+
return nullptr;
|
|
16237
|
+
}
|
|
16238
|
+
ctx->backends.push_back(backend);
|
|
16239
|
+
}
|
|
16240
|
+
}
|
|
16071
16241
|
#endif
|
|
16072
16242
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
|
16073
16243
|
if (ctx->backend_cpu == nullptr) {
|
|
@@ -16235,6 +16405,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
|
16235
16405
|
case LLM_ARCH_COMMAND_R:
|
|
16236
16406
|
case LLM_ARCH_OLMO:
|
|
16237
16407
|
case LLM_ARCH_ARCTIC:
|
|
16408
|
+
case LLM_ARCH_DEEPSEEK2:
|
|
16238
16409
|
return LLAMA_ROPE_TYPE_NORM;
|
|
16239
16410
|
|
|
16240
16411
|
// the pairs of head values are offset by n_rot/2
|
|
@@ -17849,9 +18020,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
|
|
|
17849
18020
|
return model->vocab.id_to_token[token].score;
|
|
17850
18021
|
}
|
|
17851
18022
|
|
|
17852
|
-
|
|
18023
|
+
llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
|
|
17853
18024
|
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
17854
|
-
return model->vocab.id_to_token[token].
|
|
18025
|
+
return model->vocab.id_to_token[token].attr;
|
|
17855
18026
|
}
|
|
17856
18027
|
|
|
17857
18028
|
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
|
@@ -17861,6 +18032,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
|
|
17861
18032
|
);
|
|
17862
18033
|
}
|
|
17863
18034
|
|
|
18035
|
+
bool llama_token_is_control(const struct llama_model * model, llama_token token) {
|
|
18036
|
+
return llama_is_control_token(model->vocab, token);
|
|
18037
|
+
}
|
|
18038
|
+
|
|
17864
18039
|
llama_token llama_token_bos(const struct llama_model * model) {
|
|
17865
18040
|
return model->vocab.special_bos_id;
|
|
17866
18041
|
}
|
|
@@ -17932,7 +18107,16 @@ static std::string llama_decode_text(const std::string & text) {
|
|
|
17932
18107
|
|
|
17933
18108
|
const auto cpts = unicode_cpts_from_utf8(text);
|
|
17934
18109
|
for (const auto cpt : cpts) {
|
|
17935
|
-
|
|
18110
|
+
const auto utf8 = unicode_cpt_to_utf8(cpt);
|
|
18111
|
+
try {
|
|
18112
|
+
decoded_text += unicode_utf8_to_byte(utf8);
|
|
18113
|
+
} catch (const std::out_of_range & e) {
|
|
18114
|
+
decoded_text += "[UNK_BYTE_0x";
|
|
18115
|
+
for (const auto c : utf8) {
|
|
18116
|
+
decoded_text += format("%02x", (uint8_t) c);
|
|
18117
|
+
}
|
|
18118
|
+
decoded_text += text + "]";
|
|
18119
|
+
}
|
|
17936
18120
|
}
|
|
17937
18121
|
|
|
17938
18122
|
return decoded_text;
|
|
@@ -17940,69 +18124,88 @@ static std::string llama_decode_text(const std::string & text) {
|
|
|
17940
18124
|
|
|
17941
18125
|
// does not write null-terminator to buf
|
|
17942
18126
|
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
|
|
18127
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
|
|
18128
|
+
if (!special && llama_is_control_token(model->vocab, token)) {
|
|
18129
|
+
return 0;
|
|
18130
|
+
}
|
|
18131
|
+
|
|
18132
|
+
// if we have a cache - use it
|
|
18133
|
+
{
|
|
18134
|
+
const auto & cache = model->vocab.cache_token_to_piece;
|
|
18135
|
+
|
|
18136
|
+
if (!cache.empty()) {
|
|
18137
|
+
const auto & res = cache.at(token);
|
|
18138
|
+
if (length < (int) res.size()) {
|
|
18139
|
+
return -(int) res.size();
|
|
18140
|
+
}
|
|
18141
|
+
memcpy(buf, res.c_str(), res.size());
|
|
18142
|
+
return res.size();
|
|
18143
|
+
}
|
|
18144
|
+
}
|
|
18145
|
+
|
|
17943
18146
|
if (0 <= token && token < llama_n_vocab(model)) {
|
|
17944
18147
|
switch (llama_vocab_get_type(model->vocab)) {
|
|
17945
|
-
|
|
17946
|
-
|
|
17947
|
-
|
|
17948
|
-
|
|
17949
|
-
|
|
17950
|
-
|
|
17951
|
-
|
|
17952
|
-
|
|
17953
|
-
|
|
17954
|
-
|
|
17955
|
-
|
|
17956
|
-
|
|
17957
|
-
|
|
17958
|
-
|
|
17959
|
-
|
|
17960
|
-
|
|
17961
|
-
|
|
17962
|
-
|
|
17963
|
-
|
|
17964
|
-
|
|
17965
|
-
|
|
17966
|
-
|
|
17967
|
-
|
|
17968
|
-
|
|
17969
|
-
|
|
17970
|
-
|
|
17971
|
-
|
|
17972
|
-
|
|
17973
|
-
|
|
17974
|
-
|
|
18148
|
+
case LLAMA_VOCAB_TYPE_WPM:
|
|
18149
|
+
case LLAMA_VOCAB_TYPE_SPM: {
|
|
18150
|
+
// NOTE: we accept all unsupported token types,
|
|
18151
|
+
// suppressing them like CONTROL tokens.
|
|
18152
|
+
if (llama_is_normal_token(model->vocab, token)) {
|
|
18153
|
+
std::string result = model->vocab.id_to_token[token].text;
|
|
18154
|
+
llama_unescape_whitespace(result);
|
|
18155
|
+
if (length < (int) result.length()) {
|
|
18156
|
+
return -(int) result.length();
|
|
18157
|
+
}
|
|
18158
|
+
memcpy(buf, result.c_str(), result.length());
|
|
18159
|
+
return result.length();
|
|
18160
|
+
} else if (
|
|
18161
|
+
(llama_is_user_defined_token(model->vocab, token)) ||
|
|
18162
|
+
(llama_is_control_token (model->vocab, token) && special)) {
|
|
18163
|
+
std::string result = model->vocab.id_to_token[token].text;
|
|
18164
|
+
if (length < (int) result.length()) {
|
|
18165
|
+
return -(int) result.length();
|
|
18166
|
+
}
|
|
18167
|
+
memcpy(buf, result.c_str(), result.length());
|
|
18168
|
+
return result.length();
|
|
18169
|
+
} else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
|
|
18170
|
+
if (length < 3) {
|
|
18171
|
+
return -3;
|
|
18172
|
+
}
|
|
18173
|
+
memcpy(buf, "\xe2\x96\x85", 3);
|
|
18174
|
+
return 3;
|
|
18175
|
+
} else if (llama_is_byte_token(model->vocab, token)) {
|
|
18176
|
+
if (length < 1) {
|
|
18177
|
+
return -1;
|
|
18178
|
+
}
|
|
18179
|
+
buf[0] = llama_token_to_byte(model->vocab, token);
|
|
18180
|
+
return 1;
|
|
17975
18181
|
}
|
|
17976
|
-
|
|
17977
|
-
return 1;
|
|
18182
|
+
break;
|
|
17978
18183
|
}
|
|
17979
|
-
|
|
17980
|
-
|
|
17981
|
-
|
|
17982
|
-
|
|
17983
|
-
|
|
17984
|
-
|
|
17985
|
-
|
|
17986
|
-
|
|
17987
|
-
|
|
17988
|
-
|
|
17989
|
-
|
|
17990
|
-
|
|
17991
|
-
|
|
17992
|
-
|
|
17993
|
-
|
|
17994
|
-
(
|
|
17995
|
-
|
|
17996
|
-
|
|
17997
|
-
|
|
18184
|
+
case LLAMA_VOCAB_TYPE_BPE: {
|
|
18185
|
+
// NOTE: we accept all unsupported token types,
|
|
18186
|
+
// suppressing them like CONTROL tokens.
|
|
18187
|
+
if (llama_is_normal_token(model->vocab, token)) {
|
|
18188
|
+
std::string result = model->vocab.id_to_token[token].text;
|
|
18189
|
+
result = llama_decode_text(result);
|
|
18190
|
+
if (length < (int) result.length()) {
|
|
18191
|
+
return -(int) result.length();
|
|
18192
|
+
}
|
|
18193
|
+
memcpy(buf, result.c_str(), result.length());
|
|
18194
|
+
return result.length();
|
|
18195
|
+
} else if (
|
|
18196
|
+
(llama_is_user_defined_token(model->vocab, token)) ||
|
|
18197
|
+
(llama_is_control_token (model->vocab, token) && special)) {
|
|
18198
|
+
std::string result = model->vocab.id_to_token[token].text;
|
|
18199
|
+
if (length < (int) result.length()) {
|
|
18200
|
+
return -(int) result.length();
|
|
18201
|
+
}
|
|
18202
|
+
memcpy(buf, result.c_str(), result.length());
|
|
18203
|
+
return result.length();
|
|
17998
18204
|
}
|
|
17999
|
-
|
|
18000
|
-
return result.length();
|
|
18205
|
+
break;
|
|
18001
18206
|
}
|
|
18002
|
-
|
|
18003
|
-
|
|
18004
|
-
default:
|
|
18005
|
-
GGML_ASSERT(false);
|
|
18207
|
+
default:
|
|
18208
|
+
GGML_ASSERT(false);
|
|
18006
18209
|
}
|
|
18007
18210
|
}
|
|
18008
18211
|
return 0;
|
|
@@ -18337,6 +18540,7 @@ const char * llama_print_system_info(void) {
|
|
|
18337
18540
|
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
|
|
18338
18541
|
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
|
18339
18542
|
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
|
18543
|
+
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
|
|
18340
18544
|
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
|
18341
18545
|
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
|
18342
18546
|
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|