llama_cpp 0.9.2 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +194 -8
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +826 -1482
- data/ext/llama_cpp/src/ggml.h +63 -45
- data/ext/llama_cpp/src/llama.cpp +364 -38
- data/ext/llama_cpp/src/llama.h +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -91,6 +91,8 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 
+#define LLAMA_MAX_NODES 4096
+
 //
 // logging
 //
@@ -190,6 +192,7 @@ enum llm_arch {
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BLOOM,
+    LLM_ARCH_STABLELM,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -205,6 +208,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_PERSIMMON, "persimmon" },
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
 };
 
 enum llm_kv {
@@ -251,6 +255,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_ADD_BOS,
+    LLM_KV_TOKENIZER_ADD_EOS,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
 };
@@ -299,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
     { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
     { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+    { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+    { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
 };
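
Note: both new keys are optional GGUF booleans. A minimal sketch (not from this diff) of reading such a key defensively with the gguf API — the same calls `llm_load_vocab` uses further down — assuming `ctx` is an already-loaded `gguf_context`:

    // Returns -1 if the key is absent or not a bool, else 0 or 1.
    static int read_optional_bool(struct gguf_context * ctx, const char * key) {
        const int kid = gguf_find_key(ctx, key);
        if (kid < 0) {
            return -1; // key not present
        }
        if (gguf_get_kv_type(ctx, kid) != GGUF_TYPE_BOOL) {
            return -1; // present but mistyped; the loader warns in this case
        }
        return gguf_get_val_bool(ctx, kid) ? 1 : 0;
    }

    // e.g. int add_bos = read_optional_bool(ctx, "tokenizer.ggml.add_bos_token");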
@@ -493,6 +501,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_STABLELM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1055,9 +1082,9 @@ enum e_model {
     MODEL_70B,
 };
 
-static const size_t kB = 1024;
-static const size_t MB = 1024*kB;
-static const size_t GB = 1024*MB;
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
 
 struct llama_hparams {
     bool vocab_only;
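
Note: the constants move from kB/MB/GB to the unambiguous binary units kiB/MiB/GiB; the values are unchanged, and the matching MB → MiB edits in the log strings below are cosmetic. A quick sanity check of the arithmetic (not from the diff):

    static_assert(1024 * 1024 == 1048576, "MiB is 2^20 bytes");
    // Values logged as "%.2f MiB" are computed as bytes / 1024.0 / 1024.0,
    // e.g. 5368709120 bytes -> 5120.00 MiB -> 5.00 GiB.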
@@ -1248,6 +1275,9 @@ struct llama_vocab {
     id special_sep_id = -1;
     id special_pad_id = -1;
 
+    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
     id linefeed_id = 13;
     id special_prefix_id = 32007;
     id special_middle_id = 32009;
@@ -1453,7 +1483,7 @@ static bool llama_kv_cache_init(
             vram_kv_cache += ggml_nbytes(cache.k);
         }
         if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
         }
     }
 #endif
@@ -2209,6 +2239,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STABLELM:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+
         default: (void)0;
     }
 
@@ -2350,6 +2390,23 @@ static void llm_load_vocab(
                             __func__, key.c_str(), id, old_id);
                     id = old_id;
                 }
+
+            }
+
+            // Handle add_bos_token and add_eos_token
+            std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+            int kid = gguf_find_key(ctx, key.c_str());
+            enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+            vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+            if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+                LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+            }
+            key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+            kid = gguf_find_key(ctx, key.c_str());
+            ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+            vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+            if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+                LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
             }
         }
 
@@ -2481,8 +2538,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
-    if (ml.n_bytes < GB) {
-        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    if (ml.n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
         LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
@@ -2520,7 +2577,7 @@ static void llm_load_tensors(
 
     ml.calc_sizes(ctx_size, mmapped_size);
 
-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
@@ -2872,6 +2929,13 @@ static void llm_load_tensors(
                     ggml_backend_type backend_output;
 
                     if (n_gpu_layers > int(n_layer)) {
+#ifdef GGML_USE_CUBLAS
+                        if (n_gpu_layers > int(n_layer + 1)) {
+                            LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
+                                __func__, n_layer + 1);
+                            throw std::runtime_error("Persimmon CUDA offload failed");
+                        }
+#endif
                         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                         // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
@@ -3073,6 +3137,81 @@ static void llm_load_tensors(
                         }
                     }
                 } break;
+            case LLM_ARCH_STABLELM:
+                {
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                    // output
+                    {
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
+
+                        if (n_gpu_layers > int(n_layer)) {
+                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                            // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                            backend_norm = llama_backend_offload;
+#else
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+#endif // _WIN32
+
+                            backend_output = llama_backend_offload_split;
+                        } else {
+                            backend_norm = GGML_BACKEND_CPU;
+                            backend_output = GGML_BACKEND_CPU;
+                        }
+
+                        model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                        model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                        model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                        if (backend_norm == GGML_BACKEND_GPU) {
+                            vram_weights += ggml_nbytes(model.output_norm);
+                        }
+                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                            vram_weights += ggml_nbytes(model.output);
+                        }
+                    }
+
+                    const uint32_t n_ff = hparams.n_ff;
+
+                    const int i_gpu_start = n_layer - n_gpu_layers;
+
+                    model.layers.resize(n_layer);
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        /*
+                        llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
+                        */
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                        layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                        layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+                        layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                        layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                        layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                        layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                        layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                        layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                        layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                        layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                        if (backend == GGML_BACKEND_GPU) {
+                            vram_weights +=
+                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                                ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+                                ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                        }
+                    }
+                } break;
+
             default:
                 throw std::runtime_error("unknown architecture");
         }
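
Note: `i_gpu_start = n_layer - n_gpu_layers` picks the first offloaded layer. A worked example with hypothetical numbers (not from the diff):

    // n_layer = 32, n_gpu_layers = 8  =>  i_gpu_start = 32 - 8 = 24:
    // layers 0..23  -> GGML_BACKEND_CPU
    // layers 24..31 -> llama_backend_offload / llama_backend_offload_split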
@@ -3087,7 +3226,7 @@ static void llm_load_tensors(
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory
 
-        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3106,7 +3245,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS
 
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3606,7 +3745,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_llama() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
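
Note: this and the following build_* hunks size each graph explicitly via `ggml_new_graph_custom` with the new LLAMA_MAX_NODES cap instead of relying on ggml's default graph size. The compute buffer backing these graphs uses the matching formula (see the `buf_compute.resize` hunk near the end of this diff); as a sketch:

    // Room for the metadata of up to LLAMA_MAX_NODES tensors plus one graph
    // struct; mirrors ctx->buf_compute.resize(...) further down. No tensor
    // data is included - the allocator buffer handles that separately.
    const size_t buf_size = ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead();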
@@ -3718,7 +3857,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_baichuan() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -3838,7 +3977,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_falcon() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -3960,7 +4099,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_starcoder() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4059,7 +4198,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_persimmon() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_rot = n_embd_head / 2;
 
@@ -4204,7 +4343,7 @@ struct llm_build_context {
         struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
         cb(Kcur, "Kcur", il);
 
-        struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur,
+        struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
         cb(Q, "Q", il);
 
         Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
@@ -4269,7 +4408,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_refact() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4360,7 +4499,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_bloom() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4454,7 +4593,7 @@ struct llm_build_context {
     }
 
    struct ggml_cgraph * build_mpt() {
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4551,6 +4690,177 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_stablelm() {
+        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_scale
+        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        cb(KQ_scale, "KQ_scale", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(tmpq, "tmpq", il);
+
+                struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(tmpk, "tmpk", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                // RoPE the first n_rot of q/k, pass the other half, and concat.
+                struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
+                    ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
+                    ggml_element_size(tmpq) * n_embd_head,
+                    ggml_element_size(tmpq) * n_embd_head * n_head,
+                    0
+                ));
+                cb(qrot, "qrot", il);
+
+                struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
+                    ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
+                    ggml_element_size(tmpk) * n_embd_head,
+                    ggml_element_size(tmpk) * n_embd_head * n_head_kv,
+                    0
+                ));
+                cb(krot, "krot", il);
+
+                // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
+                struct ggml_tensor * qpass = ggml_view_3d(
+                    ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
+                    ggml_element_size(tmpq) * n_embd_head,
+                    ggml_element_size(tmpq) * n_embd_head * n_head,
+                    ggml_element_size(tmpq) * hparams.n_rot
+                );
+                cb(qpass, "qpass", il);
+
+                struct ggml_tensor * kpass = ggml_view_3d(
+                    ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
+                    ggml_element_size(tmpk) * (n_embd_head),
+                    ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
+                    ggml_element_size(tmpk) * hparams.n_rot
+                );
+                cb(kpass, "kpass", il);
+
+                struct ggml_tensor * qrotated = ggml_rope_custom(
+                    ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(qrotated, "qrotated", il);
+
+                struct ggml_tensor * krotated = ggml_rope_custom(
+                    ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(krotated, "krotated", il);
+
+                // ggml currently only supports concatenation on dim=2
+                // so we need to permute qrot, qpass, concat, then permute back.
+                qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
+                cb(qrotated, "qrotated", il);
+
+                krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
+                cb(krotated, "krotated", il);
+
+                qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
+                cb(qpass, "qpass", il);
+
+                kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
+                cb(kpass, "kpass", il);
+
+                struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
+                cb(Q, "Q", il);
+
+                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                        model.layers[il].wo, NULL,
+                        Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 //
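
Note: the interesting part of build_stablelm is the partial rotary embedding: only the first `hparams.n_rot` rows of each head are RoPE'd, the remainder passes through unrotated, and the halves are concatenated again. A stripped-down sketch of the split (the tensor name `tmp` is hypothetical; shapes and strides as in the hunk above):

    // Split a [n_embd_head, n_head, n_tokens] activation into a rotated part
    // (rows 0..n_rot-1) and a pass-through part (rows n_rot..n_embd_head-1).
    struct ggml_tensor * rot = ggml_cont(ctx0, ggml_view_3d(ctx0, tmp,
        n_rot, n_head, n_tokens,
        ggml_element_size(tmp) * n_embd_head,          // stride between heads
        ggml_element_size(tmp) * n_embd_head * n_head, // stride between tokens
        0));                                           // offset 0: first n_rot rows
    struct ggml_tensor * pass = ggml_view_3d(ctx0, tmp,
        n_embd_head - n_rot, n_head, n_tokens,
        ggml_element_size(tmp) * n_embd_head,
        ggml_element_size(tmp) * n_embd_head * n_head,
        ggml_element_size(tmp) * n_rot);               // offset: skip the rotated rows
    // Only `rot` is passed through ggml_rope_custom; since ggml_concat joins on
    // dim 2, both pieces are permuted, concatenated, and permuted back, exactly
    // as the hunk above does.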
@@ -5020,6 +5330,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_mpt();
             } break;
+        case LLM_ARCH_STABLELM:
+            {
+                result = llm.build_stablelm();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -5195,7 +5509,8 @@ static int llama_decode_internal(
         model.arch == LLM_ARCH_FALCON ||
         model.arch == LLM_ARCH_REFACT ||
         model.arch == LLM_ARCH_MPT ||
-        model.arch == LLM_ARCH_STARCODER;
+        model.arch == LLM_ARCH_STARCODER ||
+        model.arch == LLM_ARCH_STABLELM;
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -5987,7 +6302,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
                     // and passing 'add space prefix' as bool argument
                     //
-                    auto raw_text = " " + fragment.raw_text.substr(fragment.offset, fragment.length);
+                    auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                    if (&fragment == &fragment_buffer.front()) {
+                        raw_text = " " + raw_text; // prefix with space if the first token is not special
+                    }
 
 #ifdef PRETOKENIZERDEBUG
                     fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
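
Note: previously every text fragment received the SentencePiece space prefix; with this change only the first one does, so text around special tokens no longer gains spurious spaces. A self-contained sketch of the front-fragment check (plain std::string fragments, not llama.cpp's fragment type):

    #include <string>
    #include <vector>

    void tokenize_fragments(const std::vector<std::string> & fragments) {
        for (const auto & fragment : fragments) {
            std::string raw_text = fragment;
            if (&fragment == &fragments.front()) {
                raw_text = " " + raw_text; // only the very first fragment is prefixed
            }
            // ... feed raw_text to the tokenizer ...
        }
    }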
@@ -7639,7 +7957,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             workers.clear();
         }
 
-        LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         int64_t tot_count = 0;
         for (size_t i = 0; i < hist_cur.size(); i++) {
             hist_all[i] += hist_cur[i];
@@ -8179,7 +8497,7 @@ struct llama_context * llama_new_context_with_model(
 
         {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
 
         // resized during inference
@@ -8196,7 +8514,7 @@ struct llama_context * llama_new_context_with_model(
         {
             static const size_t tensor_alignment = 32;
             // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
-            ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+            ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
 
             // create measure allocator
             ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
@@ -8224,7 +8542,7 @@ struct llama_context * llama_new_context_with_model(
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
-            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
 
             // recreate allocator with exact memory requirements
             ggml_allocr_free(ctx->alloc);
@@ -8238,7 +8556,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
             ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
 
             // calculate total VRAM usage
             auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8258,10 +8576,10 @@ struct llama_context * llama_new_context_with_model(
             size_t ctx_vram_size = alloc_size + kv_vram_size;
             size_t total_vram_size = model_vram_size + ctx_vram_size;
 
-            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
                     total_vram_size / 1024.0 / 1024.0,
                     model_vram_size / 1024.0 / 1024.0,
-                    ctx_vram_size
+                    ctx_vram_size / 1024.0 / 1024.0);
 #endif
         }
 
@@ -8282,7 +8600,7 @@ struct llama_context * llama_new_context_with_model(
 
         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \
@@ -8585,8 +8903,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     if (kv_buf_size) {
         const size_t elt_size = ggml_element_size(kv_self.k);
 
-        ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
-        ggml_cgraph gf{};
+        ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+        ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
         std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -8604,9 +8922,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
                 kv_head, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
-        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
-        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+        ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
 
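
Note: the scratch context for the KV-cache copy is now sized exactly instead of with a fixed constant. The copy graph appears to hold six tensor structs — the two staging tensors, the two cache views, and the two ggml_cpy results — plus one graph struct, which is presumably where the factor comes from:

    // 6 tensor structs (kout3d/vout3d or kin3d/vin3d, the k3d/v3d views, and
    // the two ggml_cpy nodes) + 1 graph struct; no tensor data, since no_alloc
    // is true. The same sizing appears again in llama_set_state_data below.
    const size_t cpy_ctx_size = 6*ggml_tensor_overhead() + ggml_graph_overhead();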
@@ -8713,8 +9031,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         const size_t elt_size = ggml_element_size(kv_self.k);
 
-        ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
-        ggml_cgraph gf{};
+        ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+        ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
         kin3d->data = (void *) inp;
@@ -8732,9 +9050,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
                 kv_head, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
-        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
-        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+        ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
@@ -8989,6 +9307,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }
 
+int llama_add_bos_token(const struct llama_model * model) {
+    return model->vocab.special_add_bos;
+}
+
+int llama_add_eos_token(const struct llama_model * model) {
+    return model->vocab.special_add_eos;
+}
+
 llama_token llama_token_prefix(const struct llama_model * model) {
     return model->vocab.special_prefix_id;
 }
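
Note: these two accessors expose the GGUF tri-state to API consumers (presumably the two added lines in data/sig/llama_cpp.rbs surface them on the Ruby side). A minimal usage sketch; the fallback policy here is an assumption, not something the diff prescribes:

    // -1 = key missing or invalid in the GGUF, 0 = don't add, 1 = add.
    const int add_bos = llama_add_bos_token(model);
    const bool should_add_bos = (add_bos != -1) ? bool(add_bos)
                                                : true; // hypothetical fallback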