cui-llama.rn 1.2.3 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/llama.cpp CHANGED
@@ -8,20 +8,20 @@
  #include "ggml-alloc.h"
  #include "ggml-backend.h"

- #ifdef LM_GGML_USE_RPC
- #  include "ggml-rpc.h"
- #endif
-
- #if defined(LM_GGML_USE_VULKAN)
- #  include "ggml-vulkan.h"
- #elif defined(LM_GGML_USE_SYCL)
- #  include "ggml-sycl.h"
- #elif defined(LM_GGML_USE_KOMPUTE)
+ #if defined(LM_GGML_USE_KOMPUTE)
  #  include "ggml-kompute.h"
  #elif defined(LM_GGML_USE_CANN)
  #  include "ggml-cann.h"
  #endif

+ #ifndef __AMX_INT8__
+ #undef LM_GGML_USE_AMX
+ #endif
+
+ #ifdef LM_GGML_USE_AMX
+ #  include "ggml-amx.h"
+ #endif
+
  // TODO: replace with ggml API call
  #define QK_K 256
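Note on the new guard above: LM_GGML_USE_AMX is now silently disabled unless the compiler itself advertises AMX int8 support, so ggml-amx.h is only pulled in for builds that can actually emit AMX instructions. A minimal sketch of what downstream code can rely on (the -mamx-int8 flag is an assumption about GCC/Clang behaviour, not something stated in this diff):

    #ifdef LM_GGML_USE_AMX
    // reaching this point implies __AMX_INT8__ was defined by the compiler,
    // e.g. a GCC/Clang build with -mamx-int8 or an -march that implies it
    #endif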

package/cpp/llama.cpp CHANGED
@@ -360,6 +360,8 @@ enum llm_kv {
  LLM_KV_TOKENIZER_MERGES,
  LLM_KV_TOKENIZER_BOS_ID,
  LLM_KV_TOKENIZER_EOS_ID,
+ LLM_KV_TOKENIZER_EOT_ID,
+ LLM_KV_TOKENIZER_EOM_ID,
  LLM_KV_TOKENIZER_UNK_ID,
  LLM_KV_TOKENIZER_SEP_ID,
  LLM_KV_TOKENIZER_PAD_ID,
@@ -372,14 +374,20 @@ enum llm_kv {
  LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
- LLM_KV_TOKENIZER_PREFIX_ID,
- LLM_KV_TOKENIZER_SUFFIX_ID,
- LLM_KV_TOKENIZER_MIDDLE_ID,
- LLM_KV_TOKENIZER_EOT_ID,
- LLM_KV_TOKENIZER_EOM_ID,
+ LLM_KV_TOKENIZER_FIM_PRE_ID,
+ LLM_KV_TOKENIZER_FIM_SUF_ID,
+ LLM_KV_TOKENIZER_FIM_MID_ID,
+ LLM_KV_TOKENIZER_FIM_PAD_ID,
+ LLM_KV_TOKENIZER_FIM_REP_ID,
+ LLM_KV_TOKENIZER_FIM_SEP_ID,

  LLM_KV_ADAPTER_TYPE,
  LLM_KV_ADAPTER_LORA_ALPHA,
+
+ // deprecated:
+ LLM_KV_TOKENIZER_PREFIX_ID,
+ LLM_KV_TOKENIZER_SUFFIX_ID,
+ LLM_KV_TOKENIZER_MIDDLE_ID,
  };

  static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -437,57 +445,65 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
  { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

- { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
- { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
- { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
- { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
- { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
- { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
- { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
- { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
- { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
-
- { LLM_KV_SPLIT_NO, "split.no" },
- { LLM_KV_SPLIT_COUNT, "split.count" },
- { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
-
- { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
- { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
- { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
- { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
- { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
-
- { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
-
- { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
- { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
- { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
- { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
- { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
- { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
- { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
- { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
- { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
- { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
- { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
- { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
- { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
- { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
- { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
- { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
- { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
- { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
- { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
- { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
- { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
- { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
- { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
- { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
- { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
- { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
-
- { LLM_KV_ADAPTER_TYPE, "adapter.type" },
- { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+ { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+ { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+ { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
+ { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
+ { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+ { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
+ { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
+ { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+ { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
+
+ { LLM_KV_SPLIT_NO, "split.no" },
+ { LLM_KV_SPLIT_COUNT, "split.count" },
+ { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
+
+ { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
+ { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
+ { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
+ { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+ { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
+
+ { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
+
+ { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+ { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
+ { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
+ { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
+ { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
+ { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
+ { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
+ { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
+ { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
+ { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
+ { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
+ { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
+ { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
+ { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+ { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
+ { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+ { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
+ { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
+ { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
+ { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
+ { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
+ { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+ { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
+ { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
+ { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
+ { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
+ { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
+ { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
+
+ { LLM_KV_ADAPTER_TYPE, "adapter.type" },
+ { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+
+ // deprecated
+ { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
  };

  struct LLM_KV {
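The fill-in-the-middle token ids introduced above are ordinary GGUF KV entries, so they can be checked the same way other tokenizer keys are read in this file. A minimal sketch using the prefixed gguf helper that appears later in this diff (whether a given model actually ships the key is an assumption):

    const int keyidx = lm_gguf_find_key(ctx, "tokenizer.ggml.fim_pre_token_id");
    if (keyidx != -1) {
        // the model defines an explicit FIM prefix token id
    }

The old prefix/suffix/middle keys are kept only as deprecated aliases that map onto the new special_fim_* fields.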
@@ -2944,9 +2960,6 @@ struct llama_sbatch_seq {
  llama_seq_id * seq_id;
  size_t offset;
  size_t length;
-
- // helper for smoother batch API transition -- can be deprecated in the future
- llama_seq_id all_seq_id; // used if seq_id == NULL
  };

  // sequence-length-aware batch splitting
@@ -3041,30 +3054,18 @@ struct llama_sbatch {
  } else {
  ubatch.embd = nullptr;
  }
- // from here on, the else branches are deprecated;
- // they are helpers for smoother batch API transition
- if (batch->pos) {
- if (ubatch.equal_seqs) {
- for (size_t i = 0; i < length; ++i) {
- ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
- }
- } else {
- // simple split
- ubatch.pos = batch->pos + seq.offset;
- }
- } else {
+ if (ubatch.equal_seqs) {
  for (size_t i = 0; i < length; ++i) {
- llama_pos bi = ids[seq.offset + i];
- ubatch.pos[ubatch.n_tokens + i] = batch->all_pos_0 + (bi * batch->all_pos_1);
+ ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
  }
+ } else {
+ // simple split
+ ubatch.pos = batch->pos + seq.offset;
  }
  if (ubatch.equal_seqs) {
  ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
  if (seq.seq_id) {
  ubatch.seq_id[ubatch.n_seqs] = seq.seq_id;
- } else {
- LM_GGML_ASSERT(seq.n_seq_id == 1);
- ubatch.seq_id[ubatch.n_seqs] = &seq.all_seq_id;
  }
  } else {
  // simple split
@@ -3077,10 +3078,6 @@ struct llama_sbatch {
  }
  if (batch->seq_id) {
  ubatch.seq_id = batch->seq_id + seq.offset;
- } else {
- for (size_t i = 0; i < length; ++i) {
- ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id;
- }
  }
  }
  if (logits_all) {
@@ -3199,7 +3196,6 @@ struct llama_sbatch {
  s.seq_id = nullptr;
  s.offset = 0;
  s.length = n_tokens;
- s.all_seq_id = batch.all_seq_id;
  return;
  }
  std::sort(ids.begin(), ids.end(),
@@ -3222,7 +3218,7 @@ struct llama_sbatch {
  if (batch.pos) {
  return batch.pos[a] < batch.pos[b];
  }
- // no pos, sort by id (assuming batch.all_pos_1 is positive)
+ // no pos, sort by id
  return a < b;
  }
  // shared prompts go first
@@ -3232,30 +3228,25 @@ struct llama_sbatch {
  // init seq
  llama_sbatch_seq * last_seq = nullptr;

- if (batch.n_seq_id != nullptr && batch.seq_id != nullptr) {
- for (size_t i = 0; i < n_tokens; ++i) {
- const size_t bi = ids[i];
- const int32_t n_seqs = batch.n_seq_id[bi];
- llama_seq_id * seq_ids = batch.seq_id[bi];
- if (last_seq != nullptr) {
- bool same = n_seqs == last_seq->n_seq_id;
- for (int32_t j = 0; same && j < n_seqs; ++j) {
- if (seq_ids[j] != last_seq->seq_id[j]) {
- same = false;
- }
- }
- if (same) {
- last_seq->length += 1;
- continue;
+ for (size_t i = 0; i < n_tokens; ++i) {
+ const size_t bi = ids[i];
+ const int32_t n_seqs = batch.n_seq_id[bi];
+ llama_seq_id * seq_ids = batch.seq_id[bi];
+ if (last_seq != nullptr) {
+ bool same = n_seqs == last_seq->n_seq_id;
+ for (int32_t j = 0; same && j < n_seqs; ++j) {
+ if (seq_ids[j] != last_seq->seq_id[j]) {
+ same = false;
  }
  }
- llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1, batch.all_seq_id};
- seq.push_back(new_seq);
- last_seq = &seq.back();
+ if (same) {
+ last_seq->length += 1;
+ continue;
+ }
  }
- } else {
- llama_sbatch_seq new_seq = {1, nullptr, 0, n_tokens, batch.all_seq_id};
+ llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
  seq.push_back(new_seq);
+ last_seq = &seq.back();
  }
  // keep shared prompts first at the end, then sort by length descending.
  std::sort(seq.begin(), seq.end(),
@@ -3419,11 +3410,7 @@ static int llama_get_device_count(const llama_model & model) {
  count += (int) model.rpc_servers.size();
  #endif

- #if defined(LM_GGML_USE_SYCL)
- count += lm_ggml_backend_sycl_get_device_count();
- #elif defined(LM_GGML_USE_VULKAN)
- count += lm_ggml_backend_vk_get_device_count();
- #elif defined(LM_GGML_USE_CANN)
+ #if defined(LM_GGML_USE_CANN)
  count += lm_ggml_backend_cann_get_device_count();
  #endif

@@ -3444,20 +3431,12 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_m
  }
  }

- #if defined(LM_GGML_USE_SYCL)
- if (host_buffer) {
- buft = lm_ggml_backend_sycl_host_buffer_type();
- }
- #elif defined(LM_GGML_USE_CANN)
+ #if defined(LM_GGML_USE_CANN)
  if (host_buffer) {
  buft = lm_ggml_backend_cann_host_buffer_type();
  }
  #elif defined(LM_GGML_USE_CPU_HBM)
  buft = lm_ggml_backend_cpu_hbm_buffer_type();
- #elif defined(LM_GGML_USE_VULKAN)
- if (host_buffer) {
- buft = lm_ggml_backend_vk_host_buffer_type();
- }
  #endif

  if (buft == nullptr) {
@@ -3471,25 +3450,12 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_m
  static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
  lm_ggml_backend_buffer_type_t buft = nullptr;

- #if defined(LM_GGML_USE_RPC)
- int rpc_count = (int)model.rpc_servers.size();
- if (device < rpc_count) {
- const char * endpoint = model.rpc_servers[device].c_str();
- return lm_ggml_backend_rpc_buffer_type(endpoint);
- }
- device -= rpc_count;
- #endif
-
  if (device < (int)model.devices.size()) {
  return lm_ggml_backend_dev_buffer_type(model.devices[device]);
  }
  device -= (int)model.devices.size();

- #if defined(LM_GGML_USE_VULKAN)
- buft = lm_ggml_backend_vk_buffer_type(device);
- #elif defined(LM_GGML_USE_SYCL)
- buft = lm_ggml_backend_sycl_buffer_type(device);
- #elif defined(LM_GGML_USE_KOMPUTE)
+ #if defined(LM_GGML_USE_KOMPUTE)
  buft = lm_ggml_backend_kompute_buffer_type(device);
  #elif defined(LM_GGML_USE_CANN)
  buft = lm_ggml_backend_cann_buffer_type(device);
@@ -3519,12 +3485,6 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama
  }
  }

- #ifdef LM_GGML_USE_SYCL
- if (lm_ggml_backend_sycl_get_device_count() > 1) {
- buft = lm_ggml_backend_sycl_split_buffer_type(tensor_split);
- }
- #endif
-
  if (buft == nullptr) {
  buft = llama_default_buffer_type_offload(model, fallback_gpu);
  }
@@ -3534,18 +3494,6 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama
  }

  static size_t llama_get_device_memory(const llama_model & model, int device) {
- #if defined(LM_GGML_USE_RPC)
- int rpc_count = (int)model.rpc_servers.size();
- if (device < rpc_count) {
- size_t total;
- size_t free;
- const char * endpoint = model.rpc_servers[device].c_str();
- lm_ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
- return free;
- }
- device = device - rpc_count;
- #endif
-
  if (device < (int)model.devices.size()) {
  lm_ggml_backend_dev_t dev = model.devices[device];
  size_t total;
@@ -3554,17 +3502,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
  return free;
  }

- #if defined(LM_GGML_USE_SYCL)
- size_t total;
- size_t free;
- lm_ggml_backend_sycl_get_device_memory(device, &free, &total);
- return free;
- #elif defined(LM_GGML_USE_VULKAN)
- size_t total;
- size_t free;
- lm_ggml_backend_vk_get_device_memory(device, &free, &total);
- return free;
- #elif defined(LM_GGML_USE_CANN)
+ #if defined(LM_GGML_USE_CANN)
  size_t total;
  size_t free;
  lm_ggml_backend_cann_get_device_memory(device, &free, &total);
@@ -3572,6 +3510,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
  #else
  return 1;
  #endif
+
  LM_GGML_UNUSED(model);
  LM_GGML_UNUSED(device);
  }
@@ -6204,14 +6143,14 @@ static void llm_load_vocab(
  vocab.type = LLAMA_VOCAB_TYPE_NONE;

  // default special tokens
- vocab.special_bos_id = -1;
- vocab.special_eos_id = -1;
- vocab.special_unk_id = -1;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
- vocab.special_cls_id = -1;
- vocab.special_mask_id = -1;
- vocab.linefeed_id = -1;
+ vocab.special_bos_id = LLAMA_TOKEN_NULL;
+ vocab.special_eos_id = LLAMA_TOKEN_NULL;
+ vocab.special_unk_id = LLAMA_TOKEN_NULL;
+ vocab.special_sep_id = LLAMA_TOKEN_NULL;
+ vocab.special_pad_id = LLAMA_TOKEN_NULL;
+ vocab.special_cls_id = LLAMA_TOKEN_NULL;
+ vocab.special_mask_id = LLAMA_TOKEN_NULL;
+ vocab.linefeed_id = LLAMA_TOKEN_NULL;

  // read vocab size from metadata
  if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
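The defaults above swap the bare -1 sentinel for LLAMA_TOKEN_NULL. Assuming the constant keeps the conventional value of -1 from llama.h, existing checks keep working and simply read better:

    if (vocab.special_bos_id != LLAMA_TOKEN_NULL) {
        // a BOS token is defined for this model
    }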
@@ -6228,16 +6167,16 @@ static void llm_load_vocab(
  vocab.special_bos_id = 1;
  vocab.special_eos_id = 2;
  vocab.special_unk_id = 0;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
- vocab.special_cls_id = -1;
- vocab.special_mask_id = -1;
+ vocab.special_sep_id = LLAMA_TOKEN_NULL;
+ vocab.special_pad_id = LLAMA_TOKEN_NULL;
+ vocab.special_cls_id = LLAMA_TOKEN_NULL;
+ vocab.special_mask_id = LLAMA_TOKEN_NULL;
  } else if (tokenizer_model == "bert") {
  vocab.type = LLAMA_VOCAB_TYPE_WPM;

  // default special tokens
- vocab.special_bos_id = -1;
- vocab.special_eos_id = -1;
+ vocab.special_bos_id = LLAMA_TOKEN_NULL;
+ vocab.special_eos_id = LLAMA_TOKEN_NULL;
  vocab.special_unk_id = 100;
  vocab.special_sep_id = 102;
  vocab.special_pad_id = 0;
@@ -6273,22 +6212,22 @@ static void llm_load_vocab(
  // default special tokens
  vocab.special_bos_id = 11;
  vocab.special_eos_id = 11;
- vocab.special_unk_id = -1;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
- vocab.special_cls_id = -1;
- vocab.special_mask_id = -1;
+ vocab.special_unk_id = LLAMA_TOKEN_NULL;
+ vocab.special_sep_id = LLAMA_TOKEN_NULL;
+ vocab.special_pad_id = LLAMA_TOKEN_NULL;
+ vocab.special_cls_id = LLAMA_TOKEN_NULL;
+ vocab.special_mask_id = LLAMA_TOKEN_NULL;
  } else if (tokenizer_model == "t5") {
  vocab.type = LLAMA_VOCAB_TYPE_UGM;

  // default special tokens
- vocab.special_bos_id = -1;
+ vocab.special_bos_id = LLAMA_TOKEN_NULL;
  vocab.special_eos_id = 1;
  vocab.special_unk_id = 2;
- vocab.special_sep_id = -1;
+ vocab.special_sep_id = LLAMA_TOKEN_NULL;
  vocab.special_pad_id = 0;
- vocab.special_cls_id = -1;
- vocab.special_mask_id = -1;
+ vocab.special_cls_id = LLAMA_TOKEN_NULL;
+ vocab.special_mask_id = LLAMA_TOKEN_NULL;

  const int precompiled_charsmap_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
  if (precompiled_charsmap_keyidx != -1) {
@@ -6311,11 +6250,11 @@ static void llm_load_vocab(
  vocab.type = LLAMA_VOCAB_TYPE_RWKV;

  // default special tokens
- vocab.special_bos_id = -1;
- vocab.special_eos_id = -1;
- vocab.special_unk_id = -1;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
+ vocab.special_bos_id = LLAMA_TOKEN_NULL;
+ vocab.special_eos_id = LLAMA_TOKEN_NULL;
+ vocab.special_unk_id = LLAMA_TOKEN_NULL;
+ vocab.special_sep_id = LLAMA_TOKEN_NULL;
+ vocab.special_pad_id = LLAMA_TOKEN_NULL;
  } else {
  throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
  }
@@ -6399,7 +6338,7 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "chatglm-bpe") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
- vocab.special_bos_id = -1;
+ vocab.special_bos_id = LLAMA_TOKEN_NULL;
  } else if (
  tokenizer_pre == "viking") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
@@ -6525,44 +6464,6 @@ static void llm_load_vocab(

  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
- // For Fill-In-the-Middle (FIM)/infill models which where converted
- // prior to support of FIM special tokens in GGUF, the following
- // will allow those models to continue to work. The general names
- // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
- // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
- // new versions of these models have been published.
- std::string gen_name;
- ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
- std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
- [](unsigned char c){ return std::tolower(c); });
-
- if (gen_name.find("code") != std::string::npos) {
- if (model.arch == LLM_ARCH_LLAMA
- && 32010 < vocab.id_to_token.size()
- && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
- && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
- && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
- && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
- vocab.special_prefix_id = 32007;
- vocab.special_suffix_id = 32008;
- vocab.special_middle_id = 32009;
- vocab.special_eot_id = 32010;
- } else if (model.arch == LLM_ARCH_GEMMA
- && 107 < vocab.id_to_token.size()
- && vocab.id_to_token[67].text == "<|fim_prefix|>"
- && vocab.id_to_token[69].text == "<|fim_suffix|>"
- && vocab.id_to_token[68].text == "<|fim_middle|>"
- && vocab.id_to_token[107].text == "<end_of_turn>") {
- vocab.special_prefix_id = 67;
- vocab.special_suffix_id = 69;
- vocab.special_middle_id = 68;
- // TODO: this is not EOT, it is "file separator" token, needs fix
- // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
- //vocab.special_eot_id = 70;
- vocab.special_eot_id = 107;
- }
- }
  try {
  vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
  } catch (const std::exception & e) {
@@ -6590,18 +6491,26 @@ static void llm_load_vocab(
  // special tokens
  {
  const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
- { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
- { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
- { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
- { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
- { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
- { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
- { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+ { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
+ { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
+ { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
+ { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
+ { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
+ { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
+ { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
+ { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
+
+ // deprecated
+ { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
  };

  for (const auto & it : special_token_types) {
@@ -6632,46 +6541,140 @@ static void llm_load_vocab(
  }
  }

- // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
- //
- // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
- // for now, we apply this workaround to find the EOT token based on its text
- if (vocab.special_eot_id == -1) {
- for (const auto & t : vocab.token_to_id) {
+ // auto-detect special tokens by text
+ // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+ // for now, we apply this workaround to find the tokens based on their text
+
+ for (const auto & t : vocab.token_to_id) {
+ // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+ if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
  if (false
- // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
- // need to fix convert script
- //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
  || t.first == "<|eot_id|>"
  || t.first == "<|im_end|>"
  || t.first == "<|end|>"
  || t.first == "<end_of_turn>"
  || t.first == "<|endoftext|>"
  || t.first == "<EOT>"
+ || t.first == "<|end▁of▁sentence|>" // DeepSeek
  ) {
  vocab.special_eot_id = t.second;
  if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.first.c_str());
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find EOM token: "<|eom_id|>"
+ if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|eom_id|>"
+ ) {
+ vocab.special_eom_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
+ if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_prefix|>" // Qwen
+ || t.first == "<fim-prefix>"
+ || t.first == "<|fim▁begin|>" // DeepSeek
+ || t.first == "<PRE>"
+ ) {
+ vocab.special_fim_pre_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
+ if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_suffix|>" // Qwen
+ || t.first == "<fim-suffix>"
+ || t.first == "<|fim▁hole|>" // DeepSeek
+ || t.first == "<SUF>"
+ ) {
+ vocab.special_fim_suf_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
+ if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_middle|>" // Qwen
+ || t.first == "<fim-middle>"
+ || t.first == "<|fim▁end|>" // DeepSeek
+ || t.first == "<MID>"
+ ) {
+ vocab.special_fim_mid_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
+ if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_pad|>" // Qwen
+ || t.first == "<fim-pad>"
+ || t.first == "<PAD>"
+ ) {
+ vocab.special_fim_pad_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
+ if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_repo|>" // Qwen
+ || t.first == "<|repo_name|>"
+ || t.first == "<fim-repo>"
+ || t.first == "<REPO>"
+ ) {
+ vocab.special_fim_rep_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
  vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  }
- break;
  }
  }
- }

- // find EOM token: "<|eom_id|>"
- //
- // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
- // for now, we apply this workaround to find the EOM token based on its text
- if (vocab.special_eom_id == -1) {
- const auto & t = vocab.token_to_id.find("<|eom_id|>");
- if (t != vocab.token_to_id.end()) {
- vocab.special_eom_id = t->second;
- if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t->first.c_str());
- vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ // find FIM_SEP token: "<|file_sep|>"
+ if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|file_sep|>" // Qwen
+ ) {
+ vocab.special_fim_sep_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
  }
  }
  }
@@ -6680,6 +6683,19 @@ static void llm_load_vocab(
  // this is currently determined based on the token text, which is obviously not ideal
  // ref: https://github.com/ggerganov/llama.cpp/issues/9606
  vocab.special_eog_ids.clear();
+
+ if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
+ vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
+ }
+
+ if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
+ vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
+ }
+
+ if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
+ vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
+ }
+
  for (const auto & t : vocab.token_to_id) {
  if (false
  || t.first == "<|eot_id|>"
@@ -6692,24 +6708,31 @@ static void llm_load_vocab(
  ) {
  vocab.special_eog_ids.insert(t.second);
  if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.first.c_str());
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
  vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  }
+ } else {
+ // token is control, but not marked as EOG -> print a debug log
+ if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
+ LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+ __func__, t.second, t.first.c_str());
+ }
  }
  }

- if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+ // sanity checks
+ if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
  vocab.special_eog_ids.insert(vocab.special_eos_id);
  LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
  }

- if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+ if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
  vocab.special_eog_ids.insert(vocab.special_eot_id);
  LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
  }

- if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+ if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
  vocab.special_eog_ids.insert(vocab.special_eom_id);
  LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
  }
@@ -6903,20 +6926,24 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

  // special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
- if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
-
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
- if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
- if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
- if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
- if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
- if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+ if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+
+ if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
+ if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
+ if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
+ if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
+ if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
+ if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }

  for (const auto & id : vocab.special_eog_ids) {
  LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
@@ -6982,7 +7009,14 @@ static bool llm_load_tensors(

  // assign cpu layers
  for (int i = 0; i < i_gpu_start; ++i) {
+ #ifdef LM_GGML_USE_AMX
+ model.buft_layer[i] = {
+ lm_ggml_backend_amx_buffer_type(),
+ llama_default_buffer_type_cpu(model, true)
+ };
+ #else
  model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
+ #endif
  }

  if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
@@ -16031,9 +16065,11 @@ struct llm_build_context {
  cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);

  cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
- cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_norm", -1);

+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  cb(cur, "result_output", -1);
+
  lm_ggml_build_forward_expand(gf, cur);

  return gf;
@@ -17083,10 +17119,10 @@ static void llama_graph_compute(
  //
  static int llama_decode_internal(
  llama_context & lctx,
- llama_batch batch_all) { // TODO: rename back to batch
+ llama_batch batch) {

  lctx.is_encoding = false;
- const uint32_t n_tokens_all = batch_all.n_tokens;
+ const uint32_t n_tokens_all = batch.n_tokens;

  if (n_tokens_all == 0) {
  LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
@@ -17097,12 +17133,12 @@ static int llama_decode_internal(
  const auto & hparams = model.hparams;
  const auto & cparams = lctx.cparams;

- LM_GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
+ LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

- if (batch_all.token) {
+ if (batch.token) {
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
- if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
- LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]);
+ if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
  return -1;
  }
  }
@@ -17133,9 +17169,9 @@ static int llama_decode_internal(
  lctx.embd_seq.clear();

  // count outputs
- if (batch_all.logits && !embd_pooled) {
+ if (batch.logits && !embd_pooled) {
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
- n_outputs += batch_all.logits[i] != 0;
+ n_outputs += batch.logits[i] != 0;
  }
  } else if (lctx.logits_all || embd_pooled) {
  n_outputs = n_tokens_all;
@@ -17144,7 +17180,7 @@ static int llama_decode_internal(
  n_outputs = 1;
  }

- lctx.sbatch.from_batch(batch_all, n_embd,
+ lctx.sbatch.from_batch(batch, n_embd,
  /* simple_split */ !kv_self.recurrent,
  /* logits_all */ n_outputs == n_tokens_all);

@@ -19029,16 +19065,20 @@ bool llama_supports_mlock(void) {
  }

  bool llama_supports_gpu_offload(void) {
- #if defined(LM_GGML_USE_VULKAN) || \
- defined(LM_GGML_USE_SYCL) || defined(LM_GGML_USE_KOMPUTE) || defined(LM_GGML_USE_RPC)
+ #if defined(LM_GGML_USE_KOMPUTE)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  return true;
  #else
  return lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
- lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
+ lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr ||
+ llama_supports_rpc();
  #endif
  }

+ bool llama_supports_rpc(void) {
+ return lm_ggml_backend_reg_by_name("RPC") != nullptr;
+ }
+
  void llama_backend_init(void) {
  lm_ggml_time_init();
19084
 
@@ -19113,6 +19153,36 @@ struct llama_model * llama_load_model_from_file(
19113
19153
  model->rpc_servers.push_back(servers);
19114
19154
  }
19115
19155
 
19156
+ // add RPC devices
19157
+ if (!model->rpc_servers.empty()) {
19158
+ lm_ggml_backend_reg_t rpc_reg = lm_ggml_backend_reg_by_name("RPC");
19159
+ if (!rpc_reg) {
19160
+ LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
19161
+ llama_free_model(model);
19162
+ return nullptr;
19163
+ }
19164
+
19165
+ // lm_ggml_backend_dev_t lm_ggml_backend_rpc_add_device(const char * endpoint);
19166
+ using lm_ggml_backend_rpc_add_device_t = lm_ggml_backend_dev_t (*)(const char *);
19167
+ lm_ggml_backend_rpc_add_device_t lm_ggml_backend_rpc_add_device_fn = (lm_ggml_backend_rpc_add_device_t) lm_ggml_backend_reg_get_proc_address(rpc_reg, "lm_ggml_backend_rpc_add_device");
19168
+ if (!lm_ggml_backend_rpc_add_device_fn) {
19169
+ LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
19170
+ llama_free_model(model);
19171
+ return nullptr;
19172
+ }
19173
+
19174
+ for (const std::string & server : model->rpc_servers) {
19175
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_rpc_add_device_fn(server.c_str());
19176
+ if (dev) {
19177
+ model->devices.push_back(dev);
19178
+ } else {
19179
+ LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
19180
+ llama_free_model(model);
19181
+ return nullptr;
19182
+ }
19183
+ }
19184
+ }
19185
+
19116
19186
  // create list of devices to use with this model
19117
19187
  // currently, we use all available devices
19118
19188
  // TODO: rework API to give user more control over device selection
@@ -19126,8 +19196,13 @@ struct llama_model * llama_load_model_from_file(

  case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
  case LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
+ {
+ size_t free, total; // NOLINT
+ lm_ggml_backend_dev_memory(dev, &free, &total);
+ LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, lm_ggml_backend_dev_name(dev), lm_ggml_backend_dev_description(dev), free/1024/1024);
  model->devices.push_back(dev);
  break;
+ }
  }
  }

@@ -19139,7 +19214,7 @@ struct llama_model * llama_load_model_from_file(
  } else if (status == -2) {
  LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
  }
- delete model;
+ llama_free_model(model);
  return nullptr;
  }

@@ -19179,7 +19254,7 @@ struct llama_context * llama_new_context_with_model(
  params.flash_attn = false;
  }

- if (params.type_v != LM_GGML_TYPE_F16 && !params.flash_attn) {
+ if (lm_ggml_is_quantized(params.type_v) && !params.flash_attn) {
  LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
  return nullptr;
  }
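The check above now rejects only quantized V-cache types, so a non-quantized type no longer forces flash_attn on; quantized types such as Q8_0 still do. A hedged illustration (whether a particular non-quantized type is otherwise supported still depends on the backend):

    // assumption: an F32 V cache passes this check even without flash_attn
    ctx_params.type_v = LM_GGML_TYPE_F32;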
@@ -19322,71 +19397,7 @@ struct llama_context * llama_new_context_with_model(
  main_gpu -= (int)model->devices.size();
  }

- #if defined(LM_GGML_USE_RPC)
- if (model->n_gpu_layers > 0) {
- for (const auto & endpoint : model->rpc_servers) {
- lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- }
- }
- if (main_gpu >= (int)model->rpc_servers.size()) {
- main_gpu -= (int)model->rpc_servers.size();
- }
- #endif
-
- #if defined(LM_GGML_USE_VULKAN)
- if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
- LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
- llama_free(ctx);
- return nullptr;
- }
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
- lm_ggml_backend_t backend = lm_ggml_backend_vk_init(main_gpu);
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- } else {
- for (int device = 0; device < lm_ggml_backend_vk_get_device_count(); ++device) {
- lm_ggml_backend_t backend = lm_ggml_backend_vk_init(device);
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- }
- }
- #elif defined(LM_GGML_USE_SYCL)
- // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
- lm_ggml_backend_t backend = lm_ggml_backend_sycl_init(main_gpu);
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- } else {
- // LLAMA_SPLIT_LAYER requires a backend for each GPU
- for (int i = 0; i < lm_ggml_backend_sycl_get_device_count(); ++i) {
- lm_ggml_backend_t backend = lm_ggml_backend_sycl_init(i);
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- }
- }
- #elif defined(LM_GGML_USE_KOMPUTE)
+ #if defined(LM_GGML_USE_KOMPUTE)
  if (model->n_gpu_layers > 0) {
  auto * backend = lm_ggml_backend_kompute_init(main_gpu);
  if (backend == nullptr) {
@@ -19475,7 +19486,7 @@ struct llama_context * llama_new_context_with_model(
  }

  LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
- (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
  lm_ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
  lm_ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
  }
@@ -21071,9 +21082,7 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {

  struct llama_batch llama_batch_get_one(
  llama_token * tokens,
- int32_t n_tokens,
- llama_pos pos_0,
- llama_seq_id seq_id) {
+ int32_t n_tokens) {
  return {
  /*n_tokens =*/ n_tokens,
  /*tokens =*/ tokens,
@@ -21082,9 +21091,6 @@ struct llama_batch llama_batch_get_one(
  /*n_seq_id =*/ nullptr,
  /*seq_id =*/ nullptr,
  /*logits =*/ nullptr,
- /*all_pos_0 =*/ pos_0,
- /*all_pos_1 =*/ 1,
- /*all_seq_id =*/ seq_id,
  };
  }
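llama_batch_get_one() above loses its pos_0 and seq_id parameters: the returned batch carries only the tokens, and the missing fields are filled in at decode time by the llama_batch_allocr helper further down. A minimal calling sketch under that assumption:

    std::vector<llama_token> tokens = /* tokenized prompt */;
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
    if (llama_decode(ctx, batch) != 0) {
        // handle error; positions and sequence ids were derived automatically
    }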

@@ -21097,9 +21103,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
  /*n_seq_id =*/ nullptr,
  /*seq_id =*/ nullptr,
  /*logits =*/ nullptr,
- /*all_pos_0 =*/ 0,
- /*all_pos_1 =*/ 0,
- /*all_seq_id =*/ 0,
  };

  if (embd) {
@@ -21135,11 +21138,62 @@ void llama_batch_free(struct llama_batch batch) {
  if (batch.logits) free(batch.logits);
  }

+ // temporary allocate memory for the input batch if needed
+ static const llama_seq_id batch_default_seq_id = 0;
+ struct llama_batch_allocr {
+ std::array<llama_seq_id, 1> seq_id_0 = {batch_default_seq_id};
+ std::vector<llama_pos> pos;
+ std::vector<int32_t> n_seq_id;
+ std::vector<llama_seq_id *> seq_id;
+ std::vector<int8_t> logits;
+ struct llama_batch batch;
+ // optionally fulfill the batch returned by llama_batch_get_one
+ llama_batch_allocr(struct llama_context * ctx, struct llama_batch in_batch) {
+ batch = in_batch;
+ if (!batch.pos) {
+ // determine the last position in KV cache
+ llama_pos last_pos = -1;
+ for (const auto & cell : ctx->kv_self.cells) {
+ if (cell.has_seq_id(batch_default_seq_id)) {
+ last_pos = std::max(last_pos, cell.pos);
+ }
+ }
+ last_pos++; // next position
+ pos.resize(batch.n_tokens);
+ for (int32_t i = 0; i < batch.n_tokens; i++) {
+ pos[i] = i+last_pos;
+ }
+ batch.pos = pos.data();
+ }
+ if (!batch.n_seq_id) {
+ n_seq_id.resize(batch.n_tokens);
+ for (int32_t i = 0; i < batch.n_tokens; i++) {
+ n_seq_id[i] = seq_id_0.size();
+ }
+ batch.n_seq_id = n_seq_id.data();
+ }
+ if (!batch.seq_id) {
+ seq_id.resize(batch.n_tokens + 1);
+ seq_id[batch.n_tokens] = NULL;
+ for (int32_t i = 0; i < batch.n_tokens; i++) {
+ seq_id[i] = seq_id_0.data();
+ }
+ batch.seq_id = seq_id.data();
+ }
+ if (!batch.logits) {
+ logits.resize(batch.n_tokens);
+ logits[logits.size() - 1] = true;
+ batch.logits = logits.data();
+ }
+ }
+ };
+
  int32_t llama_encode(
  struct llama_context * ctx,
  struct llama_batch batch) {
- const int ret = llama_encode_internal(*ctx, batch);
- if (ret < 0) {
+ llama_batch_allocr batch_allocr(ctx, batch);
+ const int ret = llama_encode_internal(*ctx, batch_allocr.batch);
+ if (ret != 0) {
  LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
  }
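For reference, the defaults that llama_batch_allocr supplies when a field of the incoming batch is null, as read from the code above:

    // pos      -> continues from the largest position of sequence 0 already in the KV cache
    // n_seq_id -> 1 for every token
    // seq_id   -> every token assigned to sequence 0
    // logits   -> only the last token requests logits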

@@ -21149,8 +21203,9 @@ int32_t llama_encode(
  int32_t llama_decode(
  struct llama_context * ctx,
  struct llama_batch batch) {
- const int ret = llama_decode_internal(*ctx, batch);
- if (ret < 0) {
+ llama_batch_allocr batch_allocr(ctx, batch);
+ const int ret = llama_decode_internal(*ctx, batch_allocr.batch);
+ if (ret != 0) {
  LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
  }

@@ -21329,6 +21384,10 @@ llama_token llama_token_eos(const struct llama_model * model) {
  return llama_token_eos_impl(model->vocab);
  }

+ llama_token llama_token_eot(const struct llama_model * model) {
+ return llama_token_eot_impl(model->vocab);
+ }
+
  llama_token llama_token_cls(const struct llama_model * model) {
  return llama_token_cls_impl(model->vocab);
  }
@@ -21365,8 +21424,28 @@ llama_token llama_token_suffix(const struct llama_model * model) {
  return llama_token_suffix_impl(model->vocab);
  }

- llama_token llama_token_eot(const struct llama_model * model) {
- return llama_token_eot_impl(model->vocab);
+ llama_token llama_token_fim_pre(const struct llama_model * model) {
+ return llama_token_fim_pre_impl(model->vocab);
+ }
+
+ llama_token llama_token_fim_suf(const struct llama_model * model) {
+ return llama_token_fim_suf_impl(model->vocab);
+ }
+
+ llama_token llama_token_fim_mid(const struct llama_model * model) {
+ return llama_token_fim_mid_impl(model->vocab);
+ }
+
+ llama_token llama_token_fim_pad(const struct llama_model * model) {
+ return llama_token_fim_pad_impl(model->vocab);
+ }
+
+ llama_token llama_token_fim_rep(const struct llama_model * model) {
+ return llama_token_fim_rep_impl(model->vocab);
+ }
+
+ llama_token llama_token_fim_sep(const struct llama_model * model) {
+ return llama_token_fim_sep_impl(model->vocab);
  }
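The new accessors return the FIM token ids loaded earlier in this diff, or LLAMA_TOKEN_NULL when a model does not define one. A hedged sketch of assembling a prefix-suffix-middle infill prompt with them (each id should be checked against LLAMA_TOKEN_NULL before use):

    std::vector<llama_token> inp;
    inp.push_back(llama_token_fim_pre(model));
    // ... tokens of the text before the cursor ...
    inp.push_back(llama_token_fim_suf(model));
    // ... tokens of the text after the cursor ...
    inp.push_back(llama_token_fim_mid(model));   // generation continues from here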

  //
@@ -21724,6 +21803,10 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
  return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
  }

+ struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
+ return llama_sampler_init_infill_impl(model->vocab);
+ }
+
  //
  // model split
  //
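llama_sampler_init_infill() above exposes the new infill sampler. A hedged sketch of attaching it to a sampler chain (the chain API is assumed from llama.h, it is not shown in this diff):

    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(smpl, llama_sampler_init_infill(model));
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());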
@@ -21763,6 +21846,7 @@ const char * llama_print_system_info(void) {
  s += "AVX512_VBMI = " + std::to_string(lm_ggml_cpu_has_avx512_vbmi()) + " | ";
  s += "AVX512_VNNI = " + std::to_string(lm_ggml_cpu_has_avx512_vnni()) + " | ";
  s += "AVX512_BF16 = " + std::to_string(lm_ggml_cpu_has_avx512_bf16()) + " | ";
+ s += "AMX_INT8 = " + std::to_string(lm_ggml_cpu_has_amx_int8()) + " | ";
  s += "FMA = " + std::to_string(lm_ggml_cpu_has_fma()) + " | ";
  s += "NEON = " + std::to_string(lm_ggml_cpu_has_neon()) + " | ";
  s += "SVE = " + std::to_string(lm_ggml_cpu_has_sve()) + " | ";