cui-llama.rn 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/java/com/rnllama/LlamaContext.java +5 -2
- package/android/src/main/jni.cpp +7 -7
- package/cpp/common.cpp +81 -63
- package/cpp/common.h +79 -62
- package/cpp/ggml-alloc.c +17 -19
- package/cpp/ggml-backend.cpp +59 -24
- package/cpp/ggml-impl.h +8 -0
- package/cpp/ggml.c +65 -23
- package/cpp/ggml.h +1 -0
- package/cpp/json-schema-to-grammar.cpp +1 -1
- package/cpp/llama-sampling.cpp +366 -24
- package/cpp/llama-sampling.h +3 -2
- package/cpp/llama-vocab.cpp +33 -9
- package/cpp/llama-vocab.h +30 -11
- package/cpp/llama.cpp +471 -387
- package/cpp/llama.h +52 -21
- package/cpp/log.cpp +50 -50
- package/cpp/log.h +18 -18
- package/cpp/rn-llama.hpp +23 -22
- package/cpp/sampling.cpp +110 -119
- package/cpp/sampling.h +20 -20
- package/package.json +1 -1
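Note (not part of the published diff): among other things, the llama.cpp/llama.h changes below drop the pos_0/seq_id parameters from llama_batch_get_one() and make llama_encode()/llama_decode() fill missing positions and sequence ids internally via a temporary llama_batch_allocr. A minimal caller-side sketch of the new convention (the decode_tokens helper is hypothetical, for illustration only):

    #include "llama.h"
    #include <vector>

    // Hypothetical helper showing the 1.2.4 calling convention.
    // 1.2.2: llama_batch_get_one(tokens.data(), n, /*pos_0=*/0, /*seq_id=*/0)
    // 1.2.4: positions and seq ids are defaulted internally when left unset.
    static int decode_tokens(llama_context * ctx, std::vector<llama_token> & tokens) {
        llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
        return llama_decode(ctx, batch); // 0 on success
    }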
package/cpp/llama.cpp
CHANGED
@@ -8,20 +8,20 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

-#ifdef LM_GGML_USE_RPC
-#  include "ggml-rpc.h"
-#endif
-
-#if defined(LM_GGML_USE_VULKAN)
-#  include "ggml-vulkan.h"
-#elif defined(LM_GGML_USE_SYCL)
-#  include "ggml-sycl.h"
-#elif defined(LM_GGML_USE_KOMPUTE)
+#if defined(LM_GGML_USE_KOMPUTE)
 #  include "ggml-kompute.h"
 #elif defined(LM_GGML_USE_CANN)
 #  include "ggml-cann.h"
 #endif

+#ifndef __AMX_INT8__
+#undef LM_GGML_USE_AMX
+#endif
+
+#ifdef LM_GGML_USE_AMX
+#  include "ggml-amx.h"
+#endif
+
 // TODO: replace with ggml API call
 #define QK_K 256

@@ -360,6 +360,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_MERGES,
     LLM_KV_TOKENIZER_BOS_ID,
     LLM_KV_TOKENIZER_EOS_ID,
+    LLM_KV_TOKENIZER_EOT_ID,
+    LLM_KV_TOKENIZER_EOM_ID,
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
@@ -372,14 +374,20 @@ enum llm_kv {
     LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
-    LLM_KV_TOKENIZER_PREFIX_ID,
-    LLM_KV_TOKENIZER_SUFFIX_ID,
-    LLM_KV_TOKENIZER_MIDDLE_ID,
-    LLM_KV_TOKENIZER_EOT_ID,
-    LLM_KV_TOKENIZER_EOM_ID,
+    LLM_KV_TOKENIZER_FIM_PRE_ID,
+    LLM_KV_TOKENIZER_FIM_SUF_ID,
+    LLM_KV_TOKENIZER_FIM_MID_ID,
+    LLM_KV_TOKENIZER_FIM_PAD_ID,
+    LLM_KV_TOKENIZER_FIM_REP_ID,
+    LLM_KV_TOKENIZER_FIM_SEP_ID,

     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
+
+    // deprecated:
+    LLM_KV_TOKENIZER_PREFIX_ID,
+    LLM_KV_TOKENIZER_SUFFIX_ID,
+    LLM_KV_TOKENIZER_MIDDLE_ID,
 };

 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -437,57 +445,65 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

-    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
-    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
-    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
-    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
-    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
-    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
-    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
-    { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
-    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
-
-    { LLM_KV_SPLIT_NO, "split.no" },
-    { LLM_KV_SPLIT_COUNT, "split.count" },
-    { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
-
-    { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
-    { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
-    { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
-    { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
-    { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
-
-    { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
-
-    { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
-    { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
-    { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
-    { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
-    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
-    { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
-    { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
-    { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
-    { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
-    { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
-    { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
-    { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
-    { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
-    { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
-    { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
-    { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
-    { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
-    { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
-    { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
-    { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
-    { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
-    { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
-    { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
-    { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
-    { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
-    { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
-
-    { LLM_KV_ADAPTER_TYPE, "adapter.type" },
-    { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
+    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
+    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
+    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
+    { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
+
+    { LLM_KV_SPLIT_NO, "split.no" },
+    { LLM_KV_SPLIT_COUNT, "split.count" },
+    { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
+
+    { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
+    { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
+    { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
+    { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+    { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
+
+    { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
+
+    { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+    { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
+    { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
+    { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
+    { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
+    { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
+    { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
+    { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
+    { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
+    { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
+    { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
+    { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+    { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+    { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
+    { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+    { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
+    { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
+    { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
+    { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
+    { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
+    { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+    { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
+    { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
+    { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
+    { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
+    { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
+    { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
+
+    { LLM_KV_ADAPTER_TYPE, "adapter.type" },
+    { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+
+    // deprecated
+    { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+    { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+    { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
 };

 struct LLM_KV {
@@ -2944,9 +2960,6 @@ struct llama_sbatch_seq {
     llama_seq_id * seq_id;
     size_t offset;
     size_t length;
-
-    // helper for smoother batch API transition -- can be deprecated in the future
-    llama_seq_id all_seq_id; // used if seq_id == NULL
 };

 // sequence-length-aware batch splitting
@@ -3041,30 +3054,18 @@ struct llama_sbatch {
        } else {
            ubatch.embd = nullptr;
        }
-
-        // they are helpers for smoother batch API transition
-        if (batch->pos) {
-            if (ubatch.equal_seqs) {
-                for (size_t i = 0; i < length; ++i) {
-                    ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
-                }
-            } else {
-                // simple split
-                ubatch.pos = batch->pos + seq.offset;
-            }
-        } else {
+        if (ubatch.equal_seqs) {
            for (size_t i = 0; i < length; ++i) {
-                llama_pos bi = ids[seq.offset + i];
-                ubatch.pos[ubatch.n_tokens + i] = batch->all_pos_0 + (bi * batch->all_pos_1);
+                ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
            }
+        } else {
+            // simple split
+            ubatch.pos = batch->pos + seq.offset;
        }
        if (ubatch.equal_seqs) {
            ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
            if (seq.seq_id) {
                ubatch.seq_id[ubatch.n_seqs] = seq.seq_id;
-            } else {
-                LM_GGML_ASSERT(seq.n_seq_id == 1);
-                ubatch.seq_id[ubatch.n_seqs] = &seq.all_seq_id;
            }
        } else {
            // simple split
@@ -3077,10 +3078,6 @@ struct llama_sbatch {
            }
            if (batch->seq_id) {
                ubatch.seq_id = batch->seq_id + seq.offset;
-            } else {
-                for (size_t i = 0; i < length; ++i) {
-                    ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id;
-                }
            }
        }
        if (logits_all) {
@@ -3199,7 +3196,6 @@ struct llama_sbatch {
            s.seq_id = nullptr;
            s.offset = 0;
            s.length = n_tokens;
-            s.all_seq_id = batch.all_seq_id;
            return;
        }
        std::sort(ids.begin(), ids.end(),
@@ -3222,7 +3218,7 @@ struct llama_sbatch {
                if (batch.pos) {
                    return batch.pos[a] < batch.pos[b];
                }
-                // no pos, sort by id
+                // no pos, sort by id
                return a < b;
            }
            // shared prompts go first
@@ -3232,30 +3228,25 @@ struct llama_sbatch {
        // init seq
        llama_sbatch_seq * last_seq = nullptr;

-        if (batch.n_seq_id != nullptr && batch.seq_id != nullptr) {
-            for (size_t i = 0; i < n_tokens; ++i) {
-                const size_t bi = ids[i];
-                const int32_t n_seqs = batch.n_seq_id[bi];
-                llama_seq_id * seq_ids = batch.seq_id[bi];
-                if (last_seq != nullptr) {
-                    bool same = n_seqs == last_seq->n_seq_id;
-                    for (int32_t j = 0; same && j < n_seqs; ++j) {
-                        if (seq_ids[j] != last_seq->seq_id[j]) {
-                            same = false;
-                        }
-                    }
-                    if (same) {
-                        last_seq->length += 1;
-                        continue;
+        for (size_t i = 0; i < n_tokens; ++i) {
+            const size_t bi = ids[i];
+            const int32_t n_seqs = batch.n_seq_id[bi];
+            llama_seq_id * seq_ids = batch.seq_id[bi];
+            if (last_seq != nullptr) {
+                bool same = n_seqs == last_seq->n_seq_id;
+                for (int32_t j = 0; same && j < n_seqs; ++j) {
+                    if (seq_ids[j] != last_seq->seq_id[j]) {
+                        same = false;
                    }
                }
-                llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1, 0};
-                seq.push_back(new_seq);
-                last_seq = &seq.back();
+                if (same) {
+                    last_seq->length += 1;
+                    continue;
+                }
            }
-        } else {
-            llama_sbatch_seq new_seq = {1, nullptr, 0, n_tokens, batch.all_seq_id};
+            llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
            seq.push_back(new_seq);
+            last_seq = &seq.back();
        }
        // keep shared prompts first at the end, then sort by length descending.
        std::sort(seq.begin(), seq.end(),
@@ -3419,11 +3410,7 @@ static int llama_get_device_count(const llama_model & model) {
    count += (int) model.rpc_servers.size();
 #endif

-#if defined(LM_GGML_USE_SYCL)
-    count += lm_ggml_backend_sycl_get_device_count();
-#elif defined(LM_GGML_USE_VULKAN)
-    count += lm_ggml_backend_vk_get_device_count();
-#elif defined(LM_GGML_USE_CANN)
+#if defined(LM_GGML_USE_CANN)
    count += lm_ggml_backend_cann_get_device_count();
 #endif

@@ -3444,20 +3431,12 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_m
        }
    }

-#if defined(LM_GGML_USE_SYCL)
-    if (host_buffer) {
-        buft = lm_ggml_backend_sycl_host_buffer_type();
-    }
-#elif defined(LM_GGML_USE_CANN)
+#if defined(LM_GGML_USE_CANN)
    if (host_buffer) {
        buft = lm_ggml_backend_cann_host_buffer_type();
    }
 #elif defined(LM_GGML_USE_CPU_HBM)
    buft = lm_ggml_backend_cpu_hbm_buffer_type();
-#elif defined(LM_GGML_USE_VULKAN)
-    if (host_buffer) {
-        buft = lm_ggml_backend_vk_host_buffer_type();
-    }
 #endif

    if (buft == nullptr) {
@@ -3471,25 +3450,12 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_m
 static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
    lm_ggml_backend_buffer_type_t buft = nullptr;

-#if defined(LM_GGML_USE_RPC)
-    int rpc_count = (int)model.rpc_servers.size();
-    if (device < rpc_count) {
-        const char * endpoint = model.rpc_servers[device].c_str();
-        return lm_ggml_backend_rpc_buffer_type(endpoint);
-    }
-    device -= rpc_count;
-#endif
-
    if (device < (int)model.devices.size()) {
        return lm_ggml_backend_dev_buffer_type(model.devices[device]);
    }
    device -= (int)model.devices.size();

-#if defined(LM_GGML_USE_VULKAN)
-    buft = lm_ggml_backend_vk_buffer_type(device);
-#elif defined(LM_GGML_USE_SYCL)
-    buft = lm_ggml_backend_sycl_buffer_type(device);
-#elif defined(LM_GGML_USE_KOMPUTE)
+#if defined(LM_GGML_USE_KOMPUTE)
    buft = lm_ggml_backend_kompute_buffer_type(device);
 #elif defined(LM_GGML_USE_CANN)
    buft = lm_ggml_backend_cann_buffer_type(device);
@@ -3519,12 +3485,6 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama
        }
    }

-#ifdef LM_GGML_USE_SYCL
-    if (lm_ggml_backend_sycl_get_device_count() > 1) {
-        buft = lm_ggml_backend_sycl_split_buffer_type(tensor_split);
-    }
-#endif
-
    if (buft == nullptr) {
        buft = llama_default_buffer_type_offload(model, fallback_gpu);
    }
@@ -3534,18 +3494,6 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama
 }

 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#if defined(LM_GGML_USE_RPC)
-    int rpc_count = (int)model.rpc_servers.size();
-    if (device < rpc_count) {
-        size_t total;
-        size_t free;
-        const char * endpoint = model.rpc_servers[device].c_str();
-        lm_ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
-        return free;
-    }
-    device = device - rpc_count;
-#endif
-
    if (device < (int)model.devices.size()) {
        lm_ggml_backend_dev_t dev = model.devices[device];
        size_t total;
@@ -3554,17 +3502,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
        return free;
    }

-#if defined(LM_GGML_USE_SYCL)
-    size_t total;
-    size_t free;
-    lm_ggml_backend_sycl_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(LM_GGML_USE_VULKAN)
-    size_t total;
-    size_t free;
-    lm_ggml_backend_vk_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(LM_GGML_USE_CANN)
+#if defined(LM_GGML_USE_CANN)
    size_t total;
    size_t free;
    lm_ggml_backend_cann_get_device_memory(device, &free, &total);
@@ -3572,6 +3510,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
 #else
    return 1;
 #endif
+
    LM_GGML_UNUSED(model);
    LM_GGML_UNUSED(device);
 }
@@ -6204,14 +6143,14 @@ static void llm_load_vocab(
        vocab.type = LLAMA_VOCAB_TYPE_NONE;

        // default special tokens
-        vocab.special_bos_id = -1;
-        vocab.special_eos_id = -1;
-        vocab.special_unk_id = -1;
-        vocab.special_sep_id = -1;
-        vocab.special_pad_id = -1;
-        vocab.special_cls_id = -1;
-        vocab.special_mask_id = -1;
-        vocab.linefeed_id = -1;
+        vocab.special_bos_id = LLAMA_TOKEN_NULL;
+        vocab.special_eos_id = LLAMA_TOKEN_NULL;
+        vocab.special_unk_id = LLAMA_TOKEN_NULL;
+        vocab.special_sep_id = LLAMA_TOKEN_NULL;
+        vocab.special_pad_id = LLAMA_TOKEN_NULL;
+        vocab.special_cls_id = LLAMA_TOKEN_NULL;
+        vocab.special_mask_id = LLAMA_TOKEN_NULL;
+        vocab.linefeed_id = LLAMA_TOKEN_NULL;

        // read vocab size from metadata
        if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
@@ -6228,16 +6167,16 @@ static void llm_load_vocab(
            vocab.special_bos_id = 1;
            vocab.special_eos_id = 2;
            vocab.special_unk_id = 0;
-            vocab.special_sep_id = -1;
-            vocab.special_pad_id = -1;
-            vocab.special_cls_id = -1;
-            vocab.special_mask_id = -1;
+            vocab.special_sep_id = LLAMA_TOKEN_NULL;
+            vocab.special_pad_id = LLAMA_TOKEN_NULL;
+            vocab.special_cls_id = LLAMA_TOKEN_NULL;
+            vocab.special_mask_id = LLAMA_TOKEN_NULL;
        } else if (tokenizer_model == "bert") {
            vocab.type = LLAMA_VOCAB_TYPE_WPM;

            // default special tokens
-            vocab.special_bos_id = -1;
-            vocab.special_eos_id = -1;
+            vocab.special_bos_id = LLAMA_TOKEN_NULL;
+            vocab.special_eos_id = LLAMA_TOKEN_NULL;
            vocab.special_unk_id = 100;
            vocab.special_sep_id = 102;
            vocab.special_pad_id = 0;
@@ -6273,22 +6212,22 @@ static void llm_load_vocab(
            // default special tokens
            vocab.special_bos_id = 11;
            vocab.special_eos_id = 11;
-            vocab.special_unk_id = -1;
-            vocab.special_sep_id = -1;
-            vocab.special_pad_id = -1;
-            vocab.special_cls_id = -1;
-            vocab.special_mask_id = -1;
+            vocab.special_unk_id = LLAMA_TOKEN_NULL;
+            vocab.special_sep_id = LLAMA_TOKEN_NULL;
+            vocab.special_pad_id = LLAMA_TOKEN_NULL;
+            vocab.special_cls_id = LLAMA_TOKEN_NULL;
+            vocab.special_mask_id = LLAMA_TOKEN_NULL;
        } else if (tokenizer_model == "t5") {
            vocab.type = LLAMA_VOCAB_TYPE_UGM;

            // default special tokens
-            vocab.special_bos_id = -1;
+            vocab.special_bos_id = LLAMA_TOKEN_NULL;
            vocab.special_eos_id = 1;
            vocab.special_unk_id = 2;
-            vocab.special_sep_id = -1;
+            vocab.special_sep_id = LLAMA_TOKEN_NULL;
            vocab.special_pad_id = 0;
-            vocab.special_cls_id = -1;
-            vocab.special_mask_id = -1;
+            vocab.special_cls_id = LLAMA_TOKEN_NULL;
+            vocab.special_mask_id = LLAMA_TOKEN_NULL;

            const int precompiled_charsmap_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
            if (precompiled_charsmap_keyidx != -1) {
@@ -6311,11 +6250,11 @@ static void llm_load_vocab(
            vocab.type = LLAMA_VOCAB_TYPE_RWKV;

            // default special tokens
-            vocab.special_bos_id = -1;
-            vocab.special_eos_id = -1;
-            vocab.special_unk_id = -1;
-            vocab.special_sep_id = -1;
-            vocab.special_pad_id = -1;
+            vocab.special_bos_id = LLAMA_TOKEN_NULL;
+            vocab.special_eos_id = LLAMA_TOKEN_NULL;
+            vocab.special_unk_id = LLAMA_TOKEN_NULL;
+            vocab.special_sep_id = LLAMA_TOKEN_NULL;
+            vocab.special_pad_id = LLAMA_TOKEN_NULL;
        } else {
            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
        }
@@ -6399,7 +6338,7 @@ static void llm_load_vocab(
        } else if (
            tokenizer_pre == "chatglm-bpe") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
-            vocab.special_bos_id = -1;
+            vocab.special_bos_id = LLAMA_TOKEN_NULL;
        } else if (
            tokenizer_pre == "viking") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
@@ -6525,44 +6464,6 @@ static void llm_load_vocab(

    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-        // For Fill-In-the-Middle (FIM)/infill models which where converted
-        // prior to support of FIM special tokens in GGUF, the following
-        // will allow those models to continue to work. The general names
-        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
-        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
-        // new versions of these models have been published.
-        std::string gen_name;
-        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
-        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
-            [](unsigned char c){ return std::tolower(c); });
-
-        if (gen_name.find("code") != std::string::npos) {
-            if (model.arch == LLM_ARCH_LLAMA
-              && 32010 < vocab.id_to_token.size()
-              && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
-              && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
-              && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
-              && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
-                vocab.special_prefix_id = 32007;
-                vocab.special_suffix_id = 32008;
-                vocab.special_middle_id = 32009;
-                vocab.special_eot_id = 32010;
-            } else if (model.arch == LLM_ARCH_GEMMA
-              && 107 < vocab.id_to_token.size()
-              && vocab.id_to_token[67].text == "<|fim_prefix|>"
-              && vocab.id_to_token[69].text == "<|fim_suffix|>"
-              && vocab.id_to_token[68].text == "<|fim_middle|>"
-              && vocab.id_to_token[107].text == "<end_of_turn>") {
-                vocab.special_prefix_id = 67;
-                vocab.special_suffix_id = 69;
-                vocab.special_middle_id = 68;
-                // TODO: this is not EOT, it is "file separator" token, needs fix
-                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
-                //vocab.special_eot_id = 70;
-                vocab.special_eot_id = 107;
-            }
-        }
        try {
            vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
        } catch (const std::exception & e) {
@@ -6590,18 +6491,26 @@ static void llm_load_vocab(
    // special tokens
    {
        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
-            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
-            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
-            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
-            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
-            { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
-            { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
-            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
-            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
-            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
-            { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
-            { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
+            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+            { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
+            { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
+            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+            { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+            { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
+            { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
+            { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
+            { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
+            { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
+            { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
+            { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
+
+            // deprecated
+            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
        };

        for (const auto & it : special_token_types) {
@@ -6632,46 +6541,140 @@ static void llm_load_vocab(
            }
        }

-        //
-        //
-        //
-
-
-
+        // auto-detect special tokens by text
+        // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+        //       for now, we apply this workaround to find the tokens based on their text
+
+        for (const auto & t : vocab.token_to_id) {
+            // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+            if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
                if (false
-                    // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
-                    //       need to fix convert script
-                    //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
                    || t.first == "<|eot_id|>"
                    || t.first == "<|im_end|>"
                    || t.first == "<|end|>"
                    || t.first == "<end_of_turn>"
                    || t.first == "<|endoftext|>"
                    || t.first == "<EOT>"
+                    || t.first == "<|end▁of▁sentence|>" // DeepSeek
                ) {
                    vocab.special_eot_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                            __func__, t.first.c_str());
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find EOM token: "<|eom_id|>"
+            if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
+                if (false
+                    || t.first == "<|eom_id|>"
+                ) {
+                    vocab.special_eom_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
+            if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
+                if (false
+                    || t.first == "<|fim_prefix|>" // Qwen
+                    || t.first == "<fim-prefix>"
+                    || t.first == "<|fim▁begin|>" // DeepSeek
+                    || t.first == "<PRE>"
+                ) {
+                    vocab.special_fim_pre_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
+            if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
+                if (false
+                    || t.first == "<|fim_suffix|>" // Qwen
+                    || t.first == "<fim-suffix>"
+                    || t.first == "<|fim▁hole|>" // DeepSeek
+                    || t.first == "<SUF>"
+                ) {
+                    vocab.special_fim_suf_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
+            if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
+                if (false
+                    || t.first == "<|fim_middle|>" // Qwen
+                    || t.first == "<fim-middle>"
+                    || t.first == "<|fim▁end|>" // DeepSeek
+                    || t.first == "<MID>"
+                ) {
+                    vocab.special_fim_mid_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
+            if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
+                if (false
+                    || t.first == "<|fim_pad|>" // Qwen
+                    || t.first == "<fim-pad>"
+                    || t.first == "<PAD>"
+                ) {
+                    vocab.special_fim_pad_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
+            if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                    || t.first == "<|fim_repo|>" // Qwen
+                    || t.first == "<|repo_name|>"
+                    || t.first == "<fim-repo>"
+                    || t.first == "<REPO>"
+                ) {
+                    vocab.special_fim_rep_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
-                    break;
                }
            }
-        }

-
-
-
-
-
-
-
-
-
-
-
-                vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+            // find FIM_SEP token: "<|file_sep|>"
+            if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                    || t.first == "<|file_sep|>" // Qwen
+                ) {
+                    vocab.special_fim_sep_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                }
            }
        }
@@ -6680,6 +6683,19 @@ static void llm_load_vocab(
    // this is currently determined based on the token text, which is obviously not ideal
    // ref: https://github.com/ggerganov/llama.cpp/issues/9606
    vocab.special_eog_ids.clear();
+
+    if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
+        vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
+    }
+
+    if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
+        vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
+    }
+
+    if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
+        vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
+    }
+
    for (const auto & t : vocab.token_to_id) {
        if (false
            || t.first == "<|eot_id|>"
@@ -6692,24 +6708,31 @@ static void llm_load_vocab(
        ) {
            vocab.special_eog_ids.insert(t.second);
            if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                    __func__, t.first.c_str());
+                LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                    __func__, t.second, t.first.c_str());
                vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
            }
+        } else {
+            // token is control, but not marked as EOG -> print a debug log
+            if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
+                LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                    __func__, t.second, t.first.c_str());
+            }
        }
    }

-    if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+    // sanity checks
+    if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
        vocab.special_eog_ids.insert(vocab.special_eos_id);
        LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
    }

-    if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+    if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
        vocab.special_eog_ids.insert(vocab.special_eot_id);
        LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
    }

-    if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+    if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
        vocab.special_eog_ids.insert(vocab.special_eom_id);
        LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
    }
@@ -6903,20 +6926,24 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

    // special tokens
-    if (vocab.special_bos_id
-    if (vocab.special_eos_id
-    if (vocab.
-    if (vocab.
-    if (vocab.
-    if (vocab.
-    if (vocab.
-
-    if (vocab.
-
-    if (vocab.
-
-    if (vocab.
-    if (vocab.
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+    if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+
+    if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
+    if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
+    if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
+    if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
+    if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
+    if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }

    for (const auto & id : vocab.special_eog_ids) {
        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
@@ -6982,7 +7009,14 @@ static bool llm_load_tensors(

    // assign cpu layers
    for (int i = 0; i < i_gpu_start; ++i) {
+#ifdef LM_GGML_USE_AMX
+        model.buft_layer[i] = {
+            lm_ggml_backend_amx_buffer_type(),
+            llama_default_buffer_type_cpu(model, true)
+        };
+#else
        model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
+#endif
    }

    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
@@ -16031,9 +16065,11 @@ struct llm_build_context {
        cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);

        cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
-        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_norm", -1);

+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
        cb(cur, "result_output", -1);
+
        lm_ggml_build_forward_expand(gf, cur);

        return gf;
@@ -17083,10 +17119,10 @@ static void llama_graph_compute(
 //
 static int llama_decode_internal(
         llama_context & lctx,
-        llama_batch   batch_all) { // TODO: rename back to batch
+        llama_batch batch) {

    lctx.is_encoding = false;
-    const uint32_t n_tokens_all = batch_all.n_tokens;
+    const uint32_t n_tokens_all = batch.n_tokens;

    if (n_tokens_all == 0) {
        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
@@ -17097,12 +17133,12 @@ static int llama_decode_internal(
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

-    LM_GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
+    LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

-    if (batch_all.token) {
+    if (batch.token) {
        for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
-                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]);
+            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
                return -1;
            }
        }
@@ -17133,9 +17169,9 @@ static int llama_decode_internal(
    lctx.embd_seq.clear();

    // count outputs
-    if (batch_all.logits && !embd_pooled) {
+    if (batch.logits && !embd_pooled) {
        for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            n_outputs += batch_all.logits[i] != 0;
+            n_outputs += batch.logits[i] != 0;
        }
    } else if (lctx.logits_all || embd_pooled) {
        n_outputs = n_tokens_all;
@@ -17144,7 +17180,7 @@ static int llama_decode_internal(
        n_outputs = 1;
    }

-    lctx.sbatch.from_batch(batch_all, n_embd,
+    lctx.sbatch.from_batch(batch, n_embd,
        /* simple_split */ !kv_self.recurrent,
        /* logits_all */ n_outputs == n_tokens_all);

@@ -19029,16 +19065,20 @@ bool llama_supports_mlock(void) {
 }

 bool llama_supports_gpu_offload(void) {
-#if defined(LM_GGML_USE_VULKAN) || \
-    defined(LM_GGML_USE_SYCL) || defined(LM_GGML_USE_KOMPUTE) || defined(LM_GGML_USE_RPC)
+#if defined(LM_GGML_USE_KOMPUTE)
    // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
    return true;
 #else
    return lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
-           lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
+           lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr ||
+           llama_supports_rpc();
 #endif
 }

+bool llama_supports_rpc(void) {
+    return lm_ggml_backend_reg_by_name("RPC") != nullptr;
+}
+
 void llama_backend_init(void) {
    lm_ggml_time_init();

@@ -19113,6 +19153,36 @@ struct llama_model * llama_load_model_from_file(
        model->rpc_servers.push_back(servers);
    }

+    // add RPC devices
+    if (!model->rpc_servers.empty()) {
+        lm_ggml_backend_reg_t rpc_reg = lm_ggml_backend_reg_by_name("RPC");
+        if (!rpc_reg) {
+            LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
+            llama_free_model(model);
+            return nullptr;
+        }
+
+        // lm_ggml_backend_dev_t lm_ggml_backend_rpc_add_device(const char * endpoint);
+        using lm_ggml_backend_rpc_add_device_t = lm_ggml_backend_dev_t (*)(const char *);
+        lm_ggml_backend_rpc_add_device_t lm_ggml_backend_rpc_add_device_fn = (lm_ggml_backend_rpc_add_device_t) lm_ggml_backend_reg_get_proc_address(rpc_reg, "lm_ggml_backend_rpc_add_device");
+        if (!lm_ggml_backend_rpc_add_device_fn) {
+            LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
+            llama_free_model(model);
+            return nullptr;
+        }
+
+        for (const std::string & server : model->rpc_servers) {
+            lm_ggml_backend_dev_t dev = lm_ggml_backend_rpc_add_device_fn(server.c_str());
+            if (dev) {
+                model->devices.push_back(dev);
+            } else {
+                LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+                llama_free_model(model);
+                return nullptr;
+            }
+        }
+    }
+
    // create list of devices to use with this model
    // currently, we use all available devices
    // TODO: rework API to give user more control over device selection
@@ -19126,8 +19196,13 @@ struct llama_model * llama_load_model_from_file(

            case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
            case LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
+            {
+                size_t free, total; // NOLINT
+                lm_ggml_backend_dev_memory(dev, &free, &total);
+                LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, lm_ggml_backend_dev_name(dev), lm_ggml_backend_dev_description(dev), free/1024/1024);
                model->devices.push_back(dev);
                break;
+            }
        }
    }

@@ -19139,7 +19214,7 @@ struct llama_model * llama_load_model_from_file(
        } else if (status == -2) {
            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
        }
-        delete model;
+        llama_free_model(model);
        return nullptr;
    }

@@ -19179,7 +19254,7 @@ struct llama_context * llama_new_context_with_model(
        params.flash_attn = false;
    }

-    if (params.type_v != LM_GGML_TYPE_F16 && !params.flash_attn) {
+    if (lm_ggml_is_quantized(params.type_v) && !params.flash_attn) {
        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
        return nullptr;
    }
@@ -19322,71 +19397,7 @@ struct llama_context * llama_new_context_with_model(
        main_gpu -= (int)model->devices.size();
    }

-#if defined(LM_GGML_USE_RPC)
-    if (model->n_gpu_layers > 0) {
-        for (const auto & endpoint : model->rpc_servers) {
-            lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-    }
-    if (main_gpu >= (int)model->rpc_servers.size()) {
-        main_gpu -= (int)model->rpc_servers.size();
-    }
-#endif
-
-#if defined(LM_GGML_USE_VULKAN)
-    if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-        LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
-        llama_free(ctx);
-        return nullptr;
-    }
-    if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-        lm_ggml_backend_t backend = lm_ggml_backend_vk_init(main_gpu);
-        if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
-            llama_free(ctx);
-            return nullptr;
-        }
-        ctx->backends.push_back(backend);
-    } else {
-        for (int device = 0; device < lm_ggml_backend_vk_get_device_count(); ++device) {
-            lm_ggml_backend_t backend = lm_ggml_backend_vk_init(device);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-    }
-#elif defined(LM_GGML_USE_SYCL)
-    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-        lm_ggml_backend_t backend = lm_ggml_backend_sycl_init(main_gpu);
-        if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
-            llama_free(ctx);
-            return nullptr;
-        }
-        ctx->backends.push_back(backend);
-    } else {
-        // LLAMA_SPLIT_LAYER requires a backend for each GPU
-        for (int i = 0; i < lm_ggml_backend_sycl_get_device_count(); ++i) {
-            lm_ggml_backend_t backend = lm_ggml_backend_sycl_init(i);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-    }
-#elif defined(LM_GGML_USE_KOMPUTE)
+#if defined(LM_GGML_USE_KOMPUTE)
    if (model->n_gpu_layers > 0) {
        auto * backend = lm_ggml_backend_kompute_init(main_gpu);
        if (backend == nullptr) {
@@ -19475,7 +19486,7 @@ struct llama_context * llama_new_context_with_model(
        }

        LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
-            (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+            (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
            lm_ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
            lm_ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
    }
@@ -21071,9 +21082,7 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {

 struct llama_batch llama_batch_get_one(
        llama_token * tokens,
-        int32_t n_tokens,
-        llama_pos pos_0,
-        llama_seq_id seq_id) {
+        int32_t n_tokens) {
    return {
        /*n_tokens =*/ n_tokens,
        /*tokens =*/ tokens,
@@ -21082,9 +21091,6 @@ struct llama_batch llama_batch_get_one(
        /*n_seq_id =*/ nullptr,
        /*seq_id =*/ nullptr,
        /*logits =*/ nullptr,
-        /*all_pos_0 =*/ pos_0,
-        /*all_pos_1 =*/ 1,
-        /*all_seq_id =*/ seq_id,
    };
 }

@@ -21097,9 +21103,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
        /*n_seq_id =*/ nullptr,
        /*seq_id =*/ nullptr,
        /*logits =*/ nullptr,
-        /*all_pos_0 =*/ 0,
-        /*all_pos_1 =*/ 0,
-        /*all_seq_id =*/ 0,
    };

    if (embd) {
@@ -21135,11 +21138,62 @@ void llama_batch_free(struct llama_batch batch) {
    if (batch.logits) free(batch.logits);
 }

+// temporary allocate memory for the input batch if needed
+static const llama_seq_id batch_default_seq_id = 0;
+struct llama_batch_allocr {
+    std::array<llama_seq_id, 1> seq_id_0 = {batch_default_seq_id};
+    std::vector<llama_pos> pos;
+    std::vector<int32_t> n_seq_id;
+    std::vector<llama_seq_id *> seq_id;
+    std::vector<int8_t> logits;
+    struct llama_batch batch;
+    // optionally fulfill the batch returned by llama_batch_get_one
+    llama_batch_allocr(struct llama_context * ctx, struct llama_batch in_batch) {
+        batch = in_batch;
+        if (!batch.pos) {
+            // determine the last position in KV cache
+            llama_pos last_pos = -1;
+            for (const auto & cell : ctx->kv_self.cells) {
+                if (cell.has_seq_id(batch_default_seq_id)) {
+                    last_pos = std::max(last_pos, cell.pos);
+                }
+            }
+            last_pos++; // next position
+            pos.resize(batch.n_tokens);
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                pos[i] = i+last_pos;
+            }
+            batch.pos = pos.data();
+        }
+        if (!batch.n_seq_id) {
+            n_seq_id.resize(batch.n_tokens);
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                n_seq_id[i] = seq_id_0.size();
+            }
+            batch.n_seq_id = n_seq_id.data();
+        }
+        if (!batch.seq_id) {
+            seq_id.resize(batch.n_tokens + 1);
+            seq_id[batch.n_tokens] = NULL;
+            for (int32_t i = 0; i < batch.n_tokens; i++) {
+                seq_id[i] = seq_id_0.data();
+            }
+            batch.seq_id = seq_id.data();
+        }
+        if (!batch.logits) {
+            logits.resize(batch.n_tokens);
+            logits[logits.size() - 1] = true;
+            batch.logits = logits.data();
+        }
+    }
+};
+
 int32_t llama_encode(
        struct llama_context * ctx,
        struct llama_batch batch) {
-    const int ret = llama_encode_internal(*ctx, batch);
-    if (ret < 0) {
+    llama_batch_allocr batch_allocr(ctx, batch);
+    const int ret = llama_encode_internal(*ctx, batch_allocr.batch);
+    if (ret != 0) {
        LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
    }

@@ -21149,8 +21203,9 @@ int32_t llama_encode(
 int32_t llama_decode(
        struct llama_context * ctx,
        struct llama_batch batch) {
-    const int ret = llama_decode_internal(*ctx, batch);
-    if (ret < 0) {
+    llama_batch_allocr batch_allocr(ctx, batch);
+    const int ret = llama_decode_internal(*ctx, batch_allocr.batch);
+    if (ret != 0) {
        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
    }

@@ -21329,6 +21384,10 @@ llama_token llama_token_eos(const struct llama_model * model) {
    return llama_token_eos_impl(model->vocab);
 }

+llama_token llama_token_eot(const struct llama_model * model) {
+    return llama_token_eot_impl(model->vocab);
+}
+
 llama_token llama_token_cls(const struct llama_model * model) {
    return llama_token_cls_impl(model->vocab);
 }
@@ -21365,8 +21424,28 @@ llama_token llama_token_suffix(const struct llama_model * model) {
    return llama_token_suffix_impl(model->vocab);
 }

-llama_token llama_token_eot(const struct llama_model * model) {
-    return llama_token_eot_impl(model->vocab);
+llama_token llama_token_fim_pre(const struct llama_model * model) {
+    return llama_token_fim_pre_impl(model->vocab);
+}
+
+llama_token llama_token_fim_suf(const struct llama_model * model) {
+    return llama_token_fim_suf_impl(model->vocab);
+}
+
+llama_token llama_token_fim_mid(const struct llama_model * model) {
+    return llama_token_fim_mid_impl(model->vocab);
+}
+
+llama_token llama_token_fim_pad(const struct llama_model * model) {
+    return llama_token_fim_pad_impl(model->vocab);
+}
+
+llama_token llama_token_fim_rep(const struct llama_model * model) {
+    return llama_token_fim_rep_impl(model->vocab);
+}
+
+llama_token llama_token_fim_sep(const struct llama_model * model) {
+    return llama_token_fim_sep_impl(model->vocab);
 }

 //
@@ -21724,6 +21803,10 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
    return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
 }

+struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
+    return llama_sampler_init_infill_impl(model->vocab);
+}
+
 //
 // model split
 //
@@ -21763,6 +21846,7 @@ const char * llama_print_system_info(void) {
    s += "AVX512_VBMI = " + std::to_string(lm_ggml_cpu_has_avx512_vbmi()) + " | ";
    s += "AVX512_VNNI = " + std::to_string(lm_ggml_cpu_has_avx512_vnni()) + " | ";
    s += "AVX512_BF16 = " + std::to_string(lm_ggml_cpu_has_avx512_bf16()) + " | ";
+    s += "AMX_INT8 = " + std::to_string(lm_ggml_cpu_has_amx_int8()) + " | ";
    s += "FMA = " + std::to_string(lm_ggml_cpu_has_fma()) + " | ";
    s += "NEON = " + std::to_string(lm_ggml_cpu_has_neon()) + " | ";
    s += "SVE = " + std::to_string(lm_ggml_cpu_has_sve()) + " | ";