llama_cpp 0.9.2 → 0.9.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +194 -8
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +826 -1482
- data/ext/llama_cpp/src/ggml.h +63 -45
- data/ext/llama_cpp/src/llama.cpp +364 -38
- data/ext/llama_cpp/src/llama.h +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -91,6 +91,8 @@
|
|
91
91
|
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
92
92
|
#endif
|
93
93
|
|
94
|
+
#define LLAMA_MAX_NODES 4096 // graph size passed to ggml_new_graph_custom() by the build_* graph builders; also multiplies ggml_tensor_overhead() when sizing ctx->buf_compute
|
95
|
+
|
94
96
|
//
|
95
97
|
// logging
|
96
98
|
//
|
@@ -190,6 +192,7 @@ enum llm_arch {
|
|
190
192
|
LLM_ARCH_PERSIMMON,
|
191
193
|
LLM_ARCH_REFACT,
|
192
194
|
LLM_ARCH_BLOOM,
|
195
|
+
LLM_ARCH_STABLELM,
|
193
196
|
LLM_ARCH_UNKNOWN,
|
194
197
|
};
|
195
198
|
|
@@ -205,6 +208,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
|
205
208
|
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
206
209
|
{ LLM_ARCH_REFACT, "refact" },
|
207
210
|
{ LLM_ARCH_BLOOM, "bloom" },
|
211
|
+
{ LLM_ARCH_STABLELM, "stablelm" },
|
208
212
|
};
|
209
213
|
|
210
214
|
enum llm_kv {
|
@@ -251,6 +255,8 @@ enum llm_kv {
|
|
251
255
|
LLM_KV_TOKENIZER_UNK_ID,
|
252
256
|
LLM_KV_TOKENIZER_SEP_ID,
|
253
257
|
LLM_KV_TOKENIZER_PAD_ID,
|
258
|
+
LLM_KV_TOKENIZER_ADD_BOS,
|
259
|
+
LLM_KV_TOKENIZER_ADD_EOS,
|
254
260
|
LLM_KV_TOKENIZER_HF_JSON,
|
255
261
|
LLM_KV_TOKENIZER_RWKV,
|
256
262
|
};
|
@@ -299,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
|
|
299
305
|
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
300
306
|
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
301
307
|
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
308
|
+
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
309
|
+
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
302
310
|
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
303
311
|
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
304
312
|
};
|
@@ -493,6 +501,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|
493
501
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
494
502
|
},
|
495
503
|
},
|
504
|
+
{
|
505
|
+
LLM_ARCH_STABLELM,
|
506
|
+
{
|
507
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
508
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
509
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
510
|
+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
511
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
512
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
513
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
514
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
515
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
516
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
517
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
518
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
519
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
520
|
+
},
|
521
|
+
},
|
522
|
+
|
496
523
|
{
|
497
524
|
LLM_ARCH_UNKNOWN,
|
498
525
|
{
|
@@ -1055,9 +1082,9 @@ enum e_model {
|
|
1055
1082
|
MODEL_70B,
|
1056
1083
|
};
|
1057
1084
|
|
1058
|
-
static const size_t
|
1059
|
-
static const size_t
|
1060
|
-
static const size_t
|
1085
|
+
static const size_t kiB = 1024; // number of bytes in one kibibyte
|
1086
|
+
static const size_t MiB = 1024*kiB; // number of bytes in one mebibyte
|
1087
|
+
static const size_t GiB = 1024*MiB; // number of bytes in one gibibyte; llm_load_print_meta compares ml.n_bytes against it to pick MiB vs GiB units
|
1061
1088
|
|
1062
1089
|
struct llama_hparams {
|
1063
1090
|
bool vocab_only;
|
@@ -1248,6 +1275,9 @@ struct llama_vocab {
|
|
1248
1275
|
id special_sep_id = -1;
|
1249
1276
|
id special_pad_id = -1;
|
1250
1277
|
|
1278
|
+
int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
|
1279
|
+
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
|
1280
|
+
|
1251
1281
|
id linefeed_id = 13;
|
1252
1282
|
id special_prefix_id = 32007;
|
1253
1283
|
id special_middle_id = 32009;
|
@@ -1453,7 +1483,7 @@ static bool llama_kv_cache_init(
|
|
1453
1483
|
vram_kv_cache += ggml_nbytes(cache.k);
|
1454
1484
|
}
|
1455
1485
|
if (vram_kv_cache > 0) {
|
1456
|
-
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f
|
1486
|
+
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
|
1457
1487
|
}
|
1458
1488
|
}
|
1459
1489
|
#endif
|
@@ -2209,6 +2239,16 @@ static void llm_load_hparams(
|
|
2209
2239
|
default: model.type = e_model::MODEL_UNKNOWN;
|
2210
2240
|
}
|
2211
2241
|
} break;
|
2242
|
+
case LLM_ARCH_STABLELM:
|
2243
|
+
{
|
2244
|
+
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
2245
|
+
|
2246
|
+
switch (hparams.n_layer) {
|
2247
|
+
case 32: model.type = e_model::MODEL_3B; break;
|
2248
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
2249
|
+
}
|
2250
|
+
} break;
|
2251
|
+
|
2212
2252
|
default: (void)0;
|
2213
2253
|
}
|
2214
2254
|
|
@@ -2350,6 +2390,23 @@ static void llm_load_vocab(
|
|
2350
2390
|
__func__, key.c_str(), id, old_id);
|
2351
2391
|
id = old_id;
|
2352
2392
|
}
|
2393
|
+
|
2394
|
+
}
|
2395
|
+
|
2396
|
+
// Handle add_bos_token and add_eos_token
|
2397
|
+
std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
|
2398
|
+
int kid = gguf_find_key(ctx, key.c_str());
|
2399
|
+
enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
|
2400
|
+
vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
|
2401
|
+
if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
|
2402
|
+
LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
|
2403
|
+
}
|
2404
|
+
key = kv(LLM_KV_TOKENIZER_ADD_EOS);
|
2405
|
+
kid = gguf_find_key(ctx, key.c_str());
|
2406
|
+
ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
|
2407
|
+
vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
|
2408
|
+
if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
|
2409
|
+
LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
|
2353
2410
|
}
|
2354
2411
|
}
|
2355
2412
|
|
@@ -2481,8 +2538,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
2481
2538
|
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
|
2482
2539
|
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
|
2483
2540
|
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
|
2484
|
-
if (ml.n_bytes <
|
2485
|
-
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,
|
2541
|
+
if (ml.n_bytes < GiB) {
|
2542
|
+
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
|
2486
2543
|
} else {
|
2487
2544
|
LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
|
2488
2545
|
}
|
@@ -2520,7 +2577,7 @@ static void llm_load_tensors(
|
|
2520
2577
|
|
2521
2578
|
ml.calc_sizes(ctx_size, mmapped_size);
|
2522
2579
|
|
2523
|
-
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f
|
2580
|
+
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
|
2524
2581
|
|
2525
2582
|
// create the ggml context
|
2526
2583
|
{
|
@@ -2872,6 +2929,13 @@ static void llm_load_tensors(
|
|
2872
2929
|
ggml_backend_type backend_output;
|
2873
2930
|
|
2874
2931
|
if (n_gpu_layers > int(n_layer)) {
|
2932
|
+
#ifdef GGML_USE_CUBLAS
|
2933
|
+
if (n_gpu_layers > int(n_layer + 1)) {
|
2934
|
+
LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
|
2935
|
+
__func__, n_layer + 1);
|
2936
|
+
throw std::runtime_error("Persimmon CUDA offload failed");
|
2937
|
+
}
|
2938
|
+
#endif
|
2875
2939
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
2876
2940
|
// on Windows however this is detrimental unless everything is on the GPU
|
2877
2941
|
#ifndef _WIN32
|
@@ -3073,6 +3137,81 @@ static void llm_load_tensors(
|
|
3073
3137
|
}
|
3074
3138
|
}
|
3075
3139
|
} break;
|
3140
|
+
case LLM_ARCH_STABLELM:
|
3141
|
+
{
|
3142
|
+
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
3143
|
+
|
3144
|
+
// output
|
3145
|
+
{
|
3146
|
+
ggml_backend_type backend_norm;
|
3147
|
+
ggml_backend_type backend_output;
|
3148
|
+
|
3149
|
+
if (n_gpu_layers > int(n_layer)) {
|
3150
|
+
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
3151
|
+
// on Windows however this is detrimental unless everything is on the GPU
|
3152
|
+
#ifndef _WIN32
|
3153
|
+
backend_norm = llama_backend_offload;
|
3154
|
+
#else
|
3155
|
+
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
|
3156
|
+
#endif // _WIN32
|
3157
|
+
|
3158
|
+
backend_output = llama_backend_offload_split;
|
3159
|
+
} else {
|
3160
|
+
backend_norm = GGML_BACKEND_CPU;
|
3161
|
+
backend_output = GGML_BACKEND_CPU;
|
3162
|
+
}
|
3163
|
+
|
3164
|
+
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
3165
|
+
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3166
|
+
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3167
|
+
|
3168
|
+
if (backend_norm == GGML_BACKEND_GPU) {
|
3169
|
+
vram_weights += ggml_nbytes(model.output_norm);
|
3170
|
+
}
|
3171
|
+
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
3172
|
+
vram_weights += ggml_nbytes(model.output);
|
3173
|
+
}
|
3174
|
+
}
|
3175
|
+
|
3176
|
+
const uint32_t n_ff = hparams.n_ff;
|
3177
|
+
|
3178
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
3179
|
+
|
3180
|
+
model.layers.resize(n_layer);
|
3181
|
+
|
3182
|
+
for (uint32_t i = 0; i < n_layer; ++i) {
|
3183
|
+
/*
|
3184
|
+
llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
|
3185
|
+
*/
|
3186
|
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
3187
|
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
3188
|
+
|
3189
|
+
auto & layer = model.layers[i];
|
3190
|
+
|
3191
|
+
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
3192
|
+
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
3193
|
+
|
3194
|
+
layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
|
3195
|
+
layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
|
3196
|
+
layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
|
3197
|
+
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
3198
|
+
|
3199
|
+
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
3200
|
+
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
3201
|
+
|
3202
|
+
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
3203
|
+
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
3204
|
+
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
3205
|
+
|
3206
|
+
if (backend == GGML_BACKEND_GPU) {
|
3207
|
+
vram_weights +=
|
3208
|
+
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
3209
|
+
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
|
3210
|
+
ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
3211
|
+
}
|
3212
|
+
}
|
3213
|
+
} break;
|
3214
|
+
|
3076
3215
|
default:
|
3077
3216
|
throw std::runtime_error("unknown architecture");
|
3078
3217
|
}
|
@@ -3087,7 +3226,7 @@ static void llm_load_tensors(
|
|
3087
3226
|
ctx_size +
|
3088
3227
|
mmapped_size - vram_weights; // weights in VRAM not in memory
|
3089
3228
|
|
3090
|
-
LLAMA_LOG_INFO("%s: mem required = %7.2f
|
3229
|
+
LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
|
3091
3230
|
|
3092
3231
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
3093
3232
|
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
@@ -3106,7 +3245,7 @@ static void llm_load_tensors(
|
|
3106
3245
|
#endif // GGML_USE_CUBLAS
|
3107
3246
|
|
3108
3247
|
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
3109
|
-
LLAMA_LOG_INFO("%s: VRAM used: %.2f
|
3248
|
+
LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
3110
3249
|
#else
|
3111
3250
|
(void) n_gpu_layers;
|
3112
3251
|
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
@@ -3606,7 +3745,7 @@ struct llm_build_context {
|
|
3606
3745
|
}
|
3607
3746
|
|
3608
3747
|
struct ggml_cgraph * build_llama() {
|
3609
|
-
struct ggml_cgraph * gf =
|
3748
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
3610
3749
|
|
3611
3750
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
3612
3751
|
|
@@ -3718,7 +3857,7 @@ struct llm_build_context {
|
|
3718
3857
|
}
|
3719
3858
|
|
3720
3859
|
struct ggml_cgraph * build_baichuan() {
|
3721
|
-
struct ggml_cgraph * gf =
|
3860
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
3722
3861
|
|
3723
3862
|
struct ggml_tensor * cur;
|
3724
3863
|
struct ggml_tensor * inpL;
|
@@ -3838,7 +3977,7 @@ struct llm_build_context {
|
|
3838
3977
|
}
|
3839
3978
|
|
3840
3979
|
struct ggml_cgraph * build_falcon() {
|
3841
|
-
struct ggml_cgraph * gf =
|
3980
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
3842
3981
|
|
3843
3982
|
struct ggml_tensor * cur;
|
3844
3983
|
struct ggml_tensor * inpL;
|
@@ -3960,7 +4099,7 @@ struct llm_build_context {
|
|
3960
4099
|
}
|
3961
4100
|
|
3962
4101
|
struct ggml_cgraph * build_starcoder() {
|
3963
|
-
struct ggml_cgraph * gf =
|
4102
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
3964
4103
|
|
3965
4104
|
struct ggml_tensor * cur;
|
3966
4105
|
struct ggml_tensor * pos;
|
@@ -4059,7 +4198,7 @@ struct llm_build_context {
|
|
4059
4198
|
}
|
4060
4199
|
|
4061
4200
|
struct ggml_cgraph * build_persimmon() {
|
4062
|
-
struct ggml_cgraph * gf =
|
4201
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
4063
4202
|
|
4064
4203
|
const int64_t n_rot = n_embd_head / 2;
|
4065
4204
|
|
@@ -4204,7 +4343,7 @@ struct llm_build_context {
|
|
4204
4343
|
struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
|
4205
4344
|
cb(Kcur, "Kcur", il);
|
4206
4345
|
|
4207
|
-
struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur,
|
4346
|
+
struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
|
4208
4347
|
cb(Q, "Q", il);
|
4209
4348
|
|
4210
4349
|
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
|
@@ -4269,7 +4408,7 @@ struct llm_build_context {
|
|
4269
4408
|
}
|
4270
4409
|
|
4271
4410
|
struct ggml_cgraph * build_refact() {
|
4272
|
-
struct ggml_cgraph * gf =
|
4411
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
4273
4412
|
|
4274
4413
|
struct ggml_tensor * cur;
|
4275
4414
|
struct ggml_tensor * inpL;
|
@@ -4360,7 +4499,7 @@ struct llm_build_context {
|
|
4360
4499
|
}
|
4361
4500
|
|
4362
4501
|
struct ggml_cgraph * build_bloom() {
|
4363
|
-
struct ggml_cgraph * gf =
|
4502
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
4364
4503
|
|
4365
4504
|
struct ggml_tensor * cur;
|
4366
4505
|
struct ggml_tensor * inpL;
|
@@ -4454,7 +4593,7 @@ struct llm_build_context {
|
|
4454
4593
|
}
|
4455
4594
|
|
4456
4595
|
struct ggml_cgraph * build_mpt() {
|
4457
|
-
struct ggml_cgraph * gf =
|
4596
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
4458
4597
|
|
4459
4598
|
struct ggml_tensor * cur;
|
4460
4599
|
struct ggml_tensor * inpL;
|
@@ -4551,6 +4690,177 @@ struct llm_build_context {
|
|
4551
4690
|
|
4552
4691
|
return gf;
|
4553
4692
|
}
|
4693
|
+
|
4694
|
+
struct ggml_cgraph * build_stablelm() { // builds and returns the StableLM forward graph: per layer norm -> self-attention with RoPE on the first n_rot dims of each head -> residual add -> norm -> FFN -> residual add; then output norm and lm_head
|
4695
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); // same graph size as every other build_* function; ctx->buf_compute is sized with LLAMA_MAX_NODES
|
4696
|
+
|
4697
|
+
struct ggml_tensor * cur;
|
4698
|
+
struct ggml_tensor * inpL;
|
4699
|
+
|
4700
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
4701
|
+
cb(inpL, "inp_embd", -1);
|
4702
|
+
|
4703
|
+
// inp_pos - contains the positions
|
4704
|
+
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
4705
|
+
cb(inp_pos, "inp_pos", -1);
|
4706
|
+
|
4707
|
+
// KQ_scale
|
4708
|
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4709
|
+
cb(KQ_scale, "KQ_scale", -1);
|
4710
|
+
|
4711
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4712
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4713
|
+
cb(KQ_mask, "KQ_mask", -1);
|
4714
|
+
|
4715
|
+
// shift the entire K-cache if needed
|
4716
|
+
if (do_rope_shift) {
|
4717
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
|
4718
|
+
}
|
4719
|
+
|
4720
|
+
for (int il = 0; il < n_layer; ++il) {
|
4721
|
+
struct ggml_tensor * inpSA = inpL; // kept for the residual add after attention
|
4722
|
+
|
4723
|
+
// norm
|
4724
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
4725
|
+
model.layers[il].attn_norm,
|
4726
|
+
model.layers[il].attn_norm_b,
|
4727
|
+
LLM_NORM, cb, il);
|
4728
|
+
cb(cur, "attn_norm", il);
|
4729
|
+
|
4730
|
+
// self-attention
|
4731
|
+
{
|
4732
|
+
// compute Q and K and RoPE them
|
4733
|
+
struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
4734
|
+
cb(tmpq, "tmpq", il);
|
4735
|
+
|
4736
|
+
struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
4737
|
+
cb(tmpk, "tmpk", il);
|
4738
|
+
|
4739
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
4740
|
+
cb(Vcur, "Vcur", il);
|
4741
|
+
|
4742
|
+
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
4743
|
+
struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
|
4744
|
+
ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
|
4745
|
+
ggml_element_size(tmpq) * n_embd_head,
|
4746
|
+
ggml_element_size(tmpq) * n_embd_head * n_head,
|
4747
|
+
0
|
4748
|
+
));
|
4749
|
+
cb(qrot, "qrot", il);
|
4750
|
+
|
4751
|
+
struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
|
4752
|
+
ctx0, tmpk, hparams.n_rot, n_head_kv, n_tokens, // K has n_head_kv heads, matching the strides below and the kpass view
|
4753
|
+
ggml_element_size(tmpk) * n_embd_head,
|
4754
|
+
ggml_element_size(tmpk) * n_embd_head * n_head_kv,
|
4755
|
+
0
|
4756
|
+
));
|
4757
|
+
cb(krot, "krot", il);
|
4758
|
+
|
4759
|
+
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
4760
|
+
struct ggml_tensor * qpass = ggml_view_3d(
|
4761
|
+
ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
|
4762
|
+
ggml_element_size(tmpq) * n_embd_head,
|
4763
|
+
ggml_element_size(tmpq) * n_embd_head * n_head,
|
4764
|
+
ggml_element_size(tmpq) * hparams.n_rot
|
4765
|
+
);
|
4766
|
+
cb(qpass, "qpass", il);
|
4767
|
+
|
4768
|
+
struct ggml_tensor * kpass = ggml_view_3d(
|
4769
|
+
ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
|
4770
|
+
ggml_element_size(tmpk) * (n_embd_head),
|
4771
|
+
ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
|
4772
|
+
ggml_element_size(tmpk) * hparams.n_rot
|
4773
|
+
);
|
4774
|
+
cb(kpass, "kpass", il);
|
4775
|
+
|
4776
|
+
struct ggml_tensor * qrotated = ggml_rope_custom(
|
4777
|
+
ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
4778
|
+
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
4779
|
+
);
|
4780
|
+
cb(qrotated, "qrotated", il);
|
4781
|
+
|
4782
|
+
struct ggml_tensor * krotated = ggml_rope_custom(
|
4783
|
+
ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
4784
|
+
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
4785
|
+
);
|
4786
|
+
cb(krotated, "krotated", il);
|
4787
|
+
|
4788
|
+
// ggml currently only supports concatenation on dim=2
|
4789
|
+
// so we need to permute qrot, qpass, concat, then permute back.
|
4790
|
+
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
|
4791
|
+
cb(qrotated, "qrotated", il);
|
4792
|
+
|
4793
|
+
krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
|
4794
|
+
cb(krotated, "krotated", il);
|
4795
|
+
|
4796
|
+
qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
|
4797
|
+
cb(qpass, "qpass", il);
|
4798
|
+
|
4799
|
+
kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
|
4800
|
+
cb(kpass, "kpass", il);
|
4801
|
+
|
4802
|
+
struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
|
4803
|
+
cb(Qcur, "Qcur", il);
|
4804
|
+
|
4805
|
+
struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
|
4806
|
+
cb(Kcur, "Kcur", il);
|
4807
|
+
|
4808
|
+
struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
|
4809
|
+
cb(Q, "Q", il);
|
4810
|
+
|
4811
|
+
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
|
4812
|
+
cb(Kcur, "Kcur", il);
|
4813
|
+
|
4814
|
+
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
4815
|
+
|
4816
|
+
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
4817
|
+
model.layers[il].wo, NULL,
|
4818
|
+
Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
4819
|
+
cb(cur, "kqv_out", il);
|
4820
|
+
}
|
4821
|
+
|
4822
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
4823
|
+
cb(ffn_inp, "ffn_inp", il);
|
4824
|
+
|
4825
|
+
// feed-forward network
|
4826
|
+
{
|
4827
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
4828
|
+
model.layers[il].ffn_norm,
|
4829
|
+
model.layers[il].ffn_norm_b,
|
4830
|
+
LLM_NORM, cb, il);
|
4831
|
+
cb(cur, "ffn_norm", il);
|
4832
|
+
|
4833
|
+
cur = llm_build_ffn(ctx0, cur,
|
4834
|
+
model.layers[il].ffn_up, NULL,
|
4835
|
+
model.layers[il].ffn_gate, NULL,
|
4836
|
+
model.layers[il].ffn_down, NULL,
|
4837
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
4838
|
+
cb(cur, "ffn_out", il);
|
4839
|
+
}
|
4840
|
+
|
4841
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
4842
|
+
cb(cur, "l_out", il);
|
4843
|
+
|
4844
|
+
// input for next layer
|
4845
|
+
inpL = cur;
|
4846
|
+
}
|
4847
|
+
|
4848
|
+
cur = inpL;
|
4849
|
+
|
4850
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
4851
|
+
model.output_norm,
|
4852
|
+
model.output_norm_b,
|
4853
|
+
LLM_NORM, cb, -1);
|
4854
|
+
cb(cur, "result_norm", -1);
|
4855
|
+
|
4856
|
+
// lm_head
|
4857
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
4858
|
+
cb(cur, "result_output", -1);
|
4859
|
+
|
4860
|
+
ggml_build_forward_expand(gf, cur);
|
4861
|
+
|
4862
|
+
return gf;
|
4863
|
+
}
|
4554
4864
|
};
|
4555
4865
|
|
4556
4866
|
//
|
@@ -5020,6 +5330,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
5020
5330
|
{
|
5021
5331
|
result = llm.build_mpt();
|
5022
5332
|
} break;
|
5333
|
+
case LLM_ARCH_STABLELM:
|
5334
|
+
{
|
5335
|
+
result = llm.build_stablelm();
|
5336
|
+
} break;
|
5023
5337
|
default:
|
5024
5338
|
GGML_ASSERT(false);
|
5025
5339
|
}
|
@@ -5195,7 +5509,8 @@ static int llama_decode_internal(
|
|
5195
5509
|
model.arch == LLM_ARCH_FALCON ||
|
5196
5510
|
model.arch == LLM_ARCH_REFACT ||
|
5197
5511
|
model.arch == LLM_ARCH_MPT ||
|
5198
|
-
model.arch == LLM_ARCH_STARCODER
|
5512
|
+
model.arch == LLM_ARCH_STARCODER ||
|
5513
|
+
model.arch == LLM_ARCH_STABLELM;
|
5199
5514
|
|
5200
5515
|
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
|
5201
5516
|
if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
|
@@ -5987,7 +6302,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
5987
6302
|
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
|
5988
6303
|
// and passing 'add space prefix' as bool argument
|
5989
6304
|
//
|
5990
|
-
auto raw_text =
|
6305
|
+
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
6306
|
+
if (&fragment == &fragment_buffer.front()) {
|
6307
|
+
raw_text = " " + raw_text; // prefix with space if the first token is not special
|
6308
|
+
}
|
5991
6309
|
|
5992
6310
|
#ifdef PRETOKENIZERDEBUG
|
5993
6311
|
fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
@@ -7639,7 +7957,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
7639
7957
|
workers.clear();
|
7640
7958
|
}
|
7641
7959
|
|
7642
|
-
LLAMA_LOG_INFO("size = %8.2f
|
7960
|
+
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
7643
7961
|
int64_t tot_count = 0;
|
7644
7962
|
for (size_t i = 0; i < hist_cur.size(); i++) {
|
7645
7963
|
hist_all[i] += hist_cur[i];
|
@@ -8179,7 +8497,7 @@ struct llama_context * llama_new_context_with_model(
|
|
8179
8497
|
|
8180
8498
|
{
|
8181
8499
|
const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
|
8182
|
-
LLAMA_LOG_INFO("%s: kv self size = %7.2f
|
8500
|
+
LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
|
8183
8501
|
}
|
8184
8502
|
|
8185
8503
|
// resized during inference
|
@@ -8196,7 +8514,7 @@ struct llama_context * llama_new_context_with_model(
|
|
8196
8514
|
{
|
8197
8515
|
static const size_t tensor_alignment = 32;
|
8198
8516
|
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
|
8199
|
-
ctx->buf_compute.resize(ggml_tensor_overhead()*
|
8517
|
+
ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
|
8200
8518
|
|
8201
8519
|
// create measure allocator
|
8202
8520
|
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
|
@@ -8224,7 +8542,7 @@ struct llama_context * llama_new_context_with_model(
|
|
8224
8542
|
// measure memory requirements for the graph
|
8225
8543
|
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
8226
8544
|
|
8227
|
-
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f
|
8545
|
+
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
|
8228
8546
|
|
8229
8547
|
// recreate allocator with exact memory requirements
|
8230
8548
|
ggml_allocr_free(ctx->alloc);
|
@@ -8238,7 +8556,7 @@ struct llama_context * llama_new_context_with_model(
|
|
8238
8556
|
#endif
|
8239
8557
|
#ifdef GGML_USE_CUBLAS
|
8240
8558
|
ggml_cuda_set_scratch_size(alloc_size);
|
8241
|
-
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f
|
8559
|
+
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
8242
8560
|
|
8243
8561
|
// calculate total VRAM usage
|
8244
8562
|
auto add_tensor = [](const ggml_tensor * t, size_t & size) {
|
@@ -8258,10 +8576,10 @@ struct llama_context * llama_new_context_with_model(
|
|
8258
8576
|
size_t ctx_vram_size = alloc_size + kv_vram_size;
|
8259
8577
|
size_t total_vram_size = model_vram_size + ctx_vram_size;
|
8260
8578
|
|
8261
|
-
LLAMA_LOG_INFO("%s: total VRAM used: %.2f
|
8579
|
+
LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
|
8262
8580
|
total_vram_size / 1024.0 / 1024.0,
|
8263
8581
|
model_vram_size / 1024.0 / 1024.0,
|
8264
|
-
ctx_vram_size
|
8582
|
+
ctx_vram_size / 1024.0 / 1024.0);
|
8265
8583
|
#endif
|
8266
8584
|
}
|
8267
8585
|
|
@@ -8282,7 +8600,7 @@ struct llama_context * llama_new_context_with_model(
|
|
8282
8600
|
|
8283
8601
|
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
8284
8602
|
|
8285
|
-
LLAMA_LOG_INFO("%s: max tensor size = %8.2f
|
8603
|
+
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
|
8286
8604
|
|
8287
8605
|
#define LLAMA_METAL_CHECK_BUF(result) \
|
8288
8606
|
if (!(result)) { \
|
@@ -8585,8 +8903,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
8585
8903
|
if (kv_buf_size) {
|
8586
8904
|
const size_t elt_size = ggml_element_size(kv_self.k);
|
8587
8905
|
|
8588
|
-
ggml_context * cpy_ctx = ggml_init({
|
8589
|
-
ggml_cgraph gf
|
8906
|
+
ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
8907
|
+
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
8590
8908
|
|
8591
8909
|
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
|
8592
8910
|
std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
|
@@ -8604,9 +8922,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
8604
8922
|
kv_head, n_embd, n_layer,
|
8605
8923
|
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
|
8606
8924
|
|
8607
|
-
ggml_build_forward_expand(
|
8608
|
-
ggml_build_forward_expand(
|
8609
|
-
ggml_graph_compute_helper(ctx->work_buffer,
|
8925
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
8926
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
8927
|
+
ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
|
8610
8928
|
|
8611
8929
|
ggml_free(cpy_ctx);
|
8612
8930
|
|
@@ -8713,8 +9031,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
8713
9031
|
|
8714
9032
|
const size_t elt_size = ggml_element_size(kv_self.k);
|
8715
9033
|
|
8716
|
-
ggml_context * cpy_ctx = ggml_init({
|
8717
|
-
ggml_cgraph gf
|
9034
|
+
ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
9035
|
+
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
8718
9036
|
|
8719
9037
|
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
|
8720
9038
|
kin3d->data = (void *) inp;
|
@@ -8732,9 +9050,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
8732
9050
|
kv_head, n_embd, n_layer,
|
8733
9051
|
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
|
8734
9052
|
|
8735
|
-
ggml_build_forward_expand(
|
8736
|
-
ggml_build_forward_expand(
|
8737
|
-
ggml_graph_compute_helper(ctx->work_buffer,
|
9053
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
9054
|
+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
9055
|
+
ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
|
8738
9056
|
|
8739
9057
|
ggml_free(cpy_ctx);
|
8740
9058
|
}
|
@@ -8989,6 +9307,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
|
|
8989
9307
|
return model->vocab.linefeed_id;
|
8990
9308
|
}
|
8991
9309
|
|
9310
|
+
int llama_add_bos_token(const struct llama_model * model) { // reports the add-BOS setting stored in the model's vocab
|
9311
|
+
return model->vocab.special_add_bos; // per the llama_vocab field comment: -1 unknown, 1 add, 0 don't add
|
9312
|
+
}
|
9313
|
+
|
9314
|
+
int llama_add_eos_token(const struct llama_model * model) { // reports the add-EOS setting stored in the model's vocab
|
9315
|
+
return model->vocab.special_add_eos; // per the llama_vocab field comment: -1 unknown, 1 add, 0 don't add
|
9316
|
+
}
|
9317
|
+
|
8992
9318
|
llama_token llama_token_prefix(const struct llama_model * model) { // returns the special prefix token id stored in the model's vocab
|
8993
9319
|
return model->vocab.special_prefix_id; // read directly from llama_vocab; no validation is done here
|
8994
9320
|
}
|