llama_cpp 0.14.7 → 0.15.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -75,6 +75,7 @@
75
75
  #include <forward_list>
76
76
  #include <fstream>
77
77
  #include <functional>
78
+ #include <future>
78
79
  #include <initializer_list>
79
80
  #include <locale>
80
81
  #include <map>
@@ -107,7 +108,6 @@
107
108
  #define LLAMA_MAX_NODES 8192
108
109
  #define LLAMA_MAX_EXPERTS 60
109
110
 
110
-
111
111
  //
112
112
  // logging
113
113
  //
@@ -316,6 +316,7 @@ enum llm_kv {
316
316
  LLM_KV_SSM_TIME_STEP_RANK,
317
317
 
318
318
  LLM_KV_TOKENIZER_MODEL,
319
+ LLM_KV_TOKENIZER_PRE,
319
320
  LLM_KV_TOKENIZER_LIST,
320
321
  LLM_KV_TOKENIZER_TOKEN_TYPE,
321
322
  LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
392
393
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
393
394
 
394
395
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
396
+ { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
395
397
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
396
398
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
397
399
  { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
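
The new "tokenizer.ggml.pre" key records which pre-tokenizer regex set a converted model expects (see the llm_load_vocab and llm_tokenizer_bpe hunks further down). As a quick way to check whether a given GGUF file carries the key, here is a minimal sketch using the gguf helpers that ship with ggml (gguf_init_from_file, gguf_find_key, gguf_get_val_str); the file name is a placeholder and error handling is kept to a minimum. Whether the key is present depends on how the model was converted.

    #include "ggml.h"   // gguf_* API bundled with llama.cpp
    #include <cstdio>

    int main() {
        struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
        struct gguf_context * gguf = gguf_init_from_file("model.gguf", params);
        if (gguf == nullptr) {
            std::fprintf(stderr, "failed to open GGUF file\n");
            return 1;
        }

        const int kid = gguf_find_key(gguf, "tokenizer.ggml.pre");
        if (kid >= 0) {
            std::printf("pre-tokenizer: %s\n", gguf_get_val_str(gguf, kid));
        } else {
            std::printf("no tokenizer.ggml.pre key - the loader falls back to 'default'\n");
        }

        gguf_free(gguf);
        return 0;
    }
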
@@ -1843,7 +1845,7 @@ struct llama_hparams {
1843
1845
  float f_logit_scale = 0.0f;
1844
1846
 
1845
1847
  bool causal_attn = true;
1846
- bool need_kq_pos = false;
1848
+ bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
1847
1849
 
1848
1850
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1849
1851
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1933,6 +1935,7 @@ struct llama_cparams {
1933
1935
  bool embeddings;
1934
1936
  bool causal_attn;
1935
1937
  bool offload_kqv;
1938
+ bool flash_attn;
1936
1939
 
1937
1940
  enum llama_pooling_type pooling_type;
1938
1941
 
@@ -2036,8 +2039,8 @@ struct llama_kv_cache {
2036
2039
  bool has_shift = false;
2037
2040
  bool do_defrag = false;
2038
2041
  bool do_copy = false;
2039
- // with recurrent state models, a cell can hold the state for more than one past token
2040
- bool recurrent = false;
2042
+ bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
2043
+ bool v_trans = true; // the value tensor is transposed
2041
2044
 
2042
2045
  // Note: The value of head isn't only used to optimize searching
2043
2046
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -2114,7 +2117,8 @@ struct llama_vocab {
2114
2117
  ttype type;
2115
2118
  };
2116
2119
 
2117
- enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
2120
+ enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
2121
+ enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2118
2122
 
2119
2123
  std::unordered_map<token, id> token_to_id;
2120
2124
  std::vector<token_data> id_to_token;
@@ -2335,11 +2339,14 @@ struct llama_context {
2335
2339
 
2336
2340
  static bool llama_kv_cache_init(
2337
2341
  struct llama_kv_cache & cache,
2338
- const llama_model & model,
2342
+ const llama_context * ctx,
2339
2343
  ggml_type type_k,
2340
2344
  ggml_type type_v,
2341
2345
  uint32_t kv_size,
2342
2346
  bool offload) {
2347
+ const llama_model & model = ctx->model;
2348
+ const llama_cparams & cparams = ctx->cparams;
2349
+
2343
2350
  const struct llama_hparams & hparams = model.hparams;
2344
2351
 
2345
2352
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
@@ -2350,8 +2357,9 @@ static bool llama_kv_cache_init(
2350
2357
 
2351
2358
  // TODO: find a nicer way to add other recurrent model architectures
2352
2359
  cache.recurrent = model.arch == LLM_ARCH_MAMBA;
2360
+ cache.v_trans = !cparams.flash_attn;
2353
2361
 
2354
- // TODO: support mixed reccurent Transformer architectues
2362
+ // TODO: support mixed recurrent Transformer architectures
2355
2363
  // NOTE: (!a || b) is a logical implication (a -> b)
2356
2364
  GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
2357
2365
  GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
@@ -2562,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
2562
2570
  }
2563
2571
  cache.head = 0;
2564
2572
  cache.used = 0;
2573
+
2574
+ for (auto & buf : cache.bufs) {
2575
+ ggml_backend_buffer_clear(buf, 0);
2576
+ }
2565
2577
  }
2566
2578
 
2567
2579
  static bool llama_kv_cache_seq_rm(
@@ -2882,6 +2894,7 @@ namespace GGUFMeta {
2882
2894
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
2883
2895
  case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
2884
2896
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
2897
+ case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
2885
2898
  }
2886
2899
  return "unknown";
2887
2900
  }
@@ -2893,13 +2906,16 @@ namespace GGUFMeta {
2893
2906
  __func__, override_type_to_str(ovrd->tag), ovrd->key);
2894
2907
  switch (ovrd->tag) {
2895
2908
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
2896
- LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
2909
+ LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
2897
2910
  } break;
2898
2911
  case LLAMA_KV_OVERRIDE_TYPE_INT: {
2899
- LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
2912
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
2900
2913
  } break;
2901
2914
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
2902
- LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
2915
+ LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
2916
+ } break;
2917
+ case LLAMA_KV_OVERRIDE_TYPE_STR: {
2918
+ LLAMA_LOG_INFO("%s\n", ovrd->val_str);
2903
2919
  } break;
2904
2920
  default:
2905
2921
  // Shouldn't be possible to end up here, but just in case...
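
With LLAMA_KV_OVERRIDE_TYPE_STR and the renamed val_* union members, string metadata can now be overridden at load time. Below is a sketch of what that might look like from the C API, assuming the llama_model_kv_override layout from this release's llama.h (a key, a tag and a val_str member) and the usual empty-key terminator; whether a particular key honours the override depends on how the loader reads it.

    #include "llama.h"
    #include <cstring>
    #include <vector>

    int main() {
        // zero-initialized entries; the list is terminated by an entry with an empty key
        std::vector<llama_model_kv_override> overrides(2);

        llama_model_kv_override & ov = overrides[0];
        ov.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        std::strncpy(ov.key,     "tokenizer.ggml.pre", sizeof(ov.key)     - 1);
        std::strncpy(ov.val_str, "llama3",             sizeof(ov.val_str) - 1);

        llama_model_params mparams = llama_model_default_params();
        mparams.kv_overrides = overrides.data();

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model != nullptr) {
            llama_free_model(model);
        }
        return 0;
    }
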
@@ -2918,7 +2934,7 @@ namespace GGUFMeta {
2918
2934
  static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
2919
2935
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2920
2936
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
2921
- target = ovrd->bool_value;
2937
+ target = ovrd->val_bool;
2922
2938
  return true;
2923
2939
  }
2924
2940
  return false;
@@ -2928,7 +2944,7 @@ namespace GGUFMeta {
2928
2944
  static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
2929
2945
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2930
2946
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
2931
- target = ovrd->int_value;
2947
+ target = ovrd->val_i64;
2932
2948
  return true;
2933
2949
  }
2934
2950
  return false;
@@ -2938,7 +2954,7 @@ namespace GGUFMeta {
2938
2954
  static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
2939
2955
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
2940
2956
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
2941
- target = ovrd->float_value;
2957
+ target = ovrd->val_f64;
2942
2958
  return true;
2943
2959
  }
2944
2960
  return false;
@@ -2947,12 +2963,11 @@ namespace GGUFMeta {
2947
2963
  template<typename OT>
2948
2964
  static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
2949
2965
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
2950
- (void)target;
2951
- (void)ovrd;
2952
- if (!ovrd) { return false; }
2953
- // Currently, we should never end up here so it would be a bug if we do.
2954
- throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
2955
- ovrd ? ovrd->key : "NULL"));
2966
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
2967
+ target = ovrd->val_str;
2968
+ return true;
2969
+ }
2970
+ return false;
2956
2971
  }
2957
2972
 
2958
2973
  static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
@@ -2985,6 +3000,7 @@ struct llama_model_loader {
2985
3000
  size_t n_bytes = 0;
2986
3001
 
2987
3002
  bool use_mmap = false;
3003
+ bool check_tensors;
2988
3004
 
2989
3005
  llama_files files;
2990
3006
  llama_ftype ftype;
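
The loader's new check_tensors flag appears to be surfaced as a matching boolean on llama_model_params in this release's llama.h (worth confirming against the header); when set, every tensor is validated with ggml_validate_row_data as it is read. A minimal sketch with a placeholder model path:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_model_params mparams = llama_model_default_params();
        mparams.check_tensors = true; // validate tensor data during load (slower, catches corrupt files)

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == nullptr) {
            std::fprintf(stderr, "load failed - possibly a tensor with invalid data\n");
            return 1;
        }

        llama_free_model(model);
        return 0;
    }
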
@@ -3018,7 +3034,7 @@ struct llama_model_loader {
3018
3034
  std::string arch_name;
3019
3035
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
3020
3036
 
3021
- llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
3037
+ llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
3022
3038
  int trace = 0;
3023
3039
  if (getenv("LLAMA_TRACE")) {
3024
3040
  trace = atoi(getenv("LLAMA_TRACE"));
@@ -3115,9 +3131,17 @@ struct llama_model_loader {
3115
3131
 
3116
3132
  fver = (enum llama_fver) gguf_get_version(meta);
3117
3133
 
3134
+ std::set<std::string> tensor_names;
3118
3135
  for (auto & w : weights) {
3119
3136
  n_elements += ggml_nelements(w.tensor);
3120
3137
  n_bytes += ggml_nbytes(w.tensor);
3138
+ // make sure there is no duplicated tensor names
3139
+ const std::string name(w.tensor->name);
3140
+ auto found = tensor_names.find(name);
3141
+ if (found != tensor_names.end()) {
3142
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
3143
+ }
3144
+ tensor_names.insert(name);
3121
3145
  }
3122
3146
 
3123
3147
  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
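
The duplicate-name guard above does a find followed by an insert; as a side note, std::set::insert already reports whether the element was new, so the same check can be expressed with a single call. A self-contained toy version of the check:

    #include <set>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // toy stand-in for the loader's duplicate-tensor-name check
    static void check_unique(const std::vector<std::string> & names) {
        std::set<std::string> seen;
        for (const std::string & name : names) {
            if (!seen.insert(name).second) { // insert() reports whether the name was new
                throw std::runtime_error("invalid model: tensor '" + name + "' is duplicated");
            }
        }
    }

    int main() {
        check_unique({"token_embd.weight", "output_norm.weight"}); // ok
        check_unique({"token_embd.weight", "token_embd.weight"});  // throws
        return 0;
    }
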
@@ -3151,6 +3175,7 @@ struct llama_model_loader {
3151
3175
  switch (type_max) {
3152
3176
  case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
3153
3177
  case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
3178
+ case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
3154
3179
  case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
3155
3180
  case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
3156
3181
  case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@@ -3223,6 +3248,7 @@ struct llama_model_loader {
3223
3248
  }
3224
3249
 
3225
3250
  this->use_mmap = use_mmap;
3251
+ this->check_tensors = check_tensors;
3226
3252
  }
3227
3253
 
3228
3254
  ~llama_model_loader() {
@@ -3481,6 +3507,10 @@ struct llama_model_loader {
3481
3507
  file->seek(w.offs, SEEK_SET);
3482
3508
  file->read_raw(cur->data, ggml_nbytes(cur));
3483
3509
  }
3510
+
3511
+ if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
3512
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3513
+ }
3484
3514
  }
3485
3515
 
3486
3516
  size_t size_done = 0;
@@ -3497,6 +3527,8 @@ struct llama_model_loader {
3497
3527
  GGML_ASSERT(size_data != 0 && "call init_mappings() first");
3498
3528
 
3499
3529
  std::vector<no_init<uint8_t>> read_buf;
3530
+ std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
3531
+
3500
3532
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3501
3533
  const auto * weight = get_weight(ggml_get_name(cur));
3502
3534
  if (weight == nullptr) {
@@ -3518,37 +3550,66 @@ struct llama_model_loader {
3518
3550
  if (bufs_mmap.count(weight->idx)) {
3519
3551
  buf_mmap = bufs_mmap.at(weight->idx);
3520
3552
  }
3553
+ uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
3554
+
3555
+ if (check_tensors) {
3556
+ validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
3557
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
3558
+ }));
3559
+ }
3560
+
3521
3561
  GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
3522
3562
  if (buf_mmap && cur->data == nullptr) {
3523
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
3563
+ ggml_backend_tensor_alloc(buf_mmap, cur, data);
3524
3564
  if (lmlocks) {
3525
3565
  const auto & lmlock = lmlocks->at(weight->idx);
3526
- lmlock->grow_to(weight->offs + ggml_nbytes(cur));
3566
+ lmlock->grow_to(weight->offs + n_size);
3527
3567
  }
3528
3568
 
3529
3569
  auto & mmap_used = mmaps_used[weight->idx];
3530
3570
  mmap_used.first = std::min(mmap_used.first, weight->offs);
3531
3571
  mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
3532
3572
  } else {
3533
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
3573
+ ggml_backend_tensor_set(cur, data, 0, n_size);
3534
3574
  }
3535
3575
  } else {
3536
3576
  GGML_ASSERT(weight->idx < files.size());
3537
3577
  const auto & file = files.at(weight->idx);
3538
3578
  if (ggml_backend_buffer_is_host(cur->buffer)) {
3539
3579
  file->seek(weight->offs, SEEK_SET);
3540
- file->read_raw(cur->data, ggml_nbytes(cur));
3580
+ file->read_raw(cur->data, n_size);
3581
+ if (check_tensors) {
3582
+ validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
3583
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
3584
+ }));
3585
+ }
3541
3586
  } else {
3542
- read_buf.resize(ggml_nbytes(cur));
3587
+ read_buf.resize(n_size);
3543
3588
  file->seek(weight->offs, SEEK_SET);
3544
- file->read_raw(read_buf.data(), ggml_nbytes(cur));
3589
+ file->read_raw(read_buf.data(), n_size);
3545
3590
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3591
+ if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
3592
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3593
+ }
3546
3594
  }
3547
3595
  }
3548
3596
 
3549
3597
  size_done += n_size;
3550
3598
  }
3551
3599
 
3600
+ // check validation results
3601
+ bool validation_failed = false;
3602
+ for (auto & future : validation_result) {
3603
+ auto result = future.get();
3604
+ if (!result.second) {
3605
+ LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
3606
+ validation_failed = true;
3607
+ }
3608
+ }
3609
+ if (validation_failed) {
3610
+ throw std::runtime_error("found tensors with invalid data");
3611
+ }
3612
+
3552
3613
  // check if this is the last call and do final cleanup
3553
3614
  if (size_done >= size_data) {
3554
3615
  // unmap offloaded tensors and metadata
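
The mmap branch above defers tensor validation to std::async tasks (hence the new <future> include) and only inspects the futures after all tensors have been queued, so validation overlaps with the rest of the load. A self-contained toy version of the same pattern, with a placeholder check standing in for ggml_validate_row_data:

    #include <cstdint>
    #include <cstdio>
    #include <future>
    #include <string>
    #include <utility>
    #include <vector>

    static bool validate(const std::vector<uint8_t> & data) {
        return !data.empty(); // placeholder; the loader calls ggml_validate_row_data() here
    }

    int main() {
        const std::vector<std::pair<std::string, std::vector<uint8_t>>> tensors = {
            {"blk.0.attn_q.weight", {1, 2, 3}},
            {"blk.0.attn_k.weight", {}},        // deliberately "invalid"
        };

        std::vector<std::future<std::pair<std::string, bool>>> results;
        for (const auto & t : tensors) {
            // launch one background validation task per tensor
            results.emplace_back(std::async(std::launch::async, [&t] {
                return std::make_pair(t.first, validate(t.second));
            }));
        }

        // collect the results only after everything has been queued
        bool failed = false;
        for (auto & fut : results) {
            const auto res = fut.get();
            if (!res.second) {
                std::fprintf(stderr, "tensor '%s' has invalid data\n", res.first.c_str());
                failed = true;
            }
        }
        return failed ? 1 : 0;
    }
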
@@ -3606,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
3606
3667
  switch (ftype) {
3607
3668
  case LLAMA_FTYPE_ALL_F32: return "all F32";
3608
3669
  case LLAMA_FTYPE_MOSTLY_F16: return "F16";
3670
+ case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
3609
3671
  case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
3610
3672
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
3611
3673
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -4142,7 +4204,7 @@ static void llm_load_hparams(
4142
4204
  model.ftype = ml.ftype;
4143
4205
 
4144
4206
  if (hparams.f_max_alibi_bias > 0.0f) {
4145
- hparams.need_kq_pos = true;
4207
+ hparams.use_alibi = true;
4146
4208
  }
4147
4209
 
4148
4210
  hparams.rope_type = llama_rope_type(&model);
@@ -4165,11 +4227,13 @@ static void llm_load_vocab(
4165
4227
 
4166
4228
  // determine vocab type
4167
4229
  {
4168
- std::string tokenizer_name;
4230
+ std::string tokenizer_model;
4231
+ std::string tokenizer_pre;
4169
4232
 
4170
- ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
4233
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
4234
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
4171
4235
 
4172
- if (tokenizer_name == "no_vocab") {
4236
+ if (tokenizer_model == "no_vocab") {
4173
4237
  vocab.type = LLAMA_VOCAB_TYPE_NONE;
4174
4238
 
4175
4239
  // default special tokens
@@ -4183,7 +4247,7 @@ static void llm_load_vocab(
4183
4247
  vocab.linefeed_id = -1;
4184
4248
 
4185
4249
  return;
4186
- } else if (tokenizer_name == "llama") {
4250
+ } else if (tokenizer_model == "llama") {
4187
4251
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
4188
4252
 
4189
4253
  // default special tokens
@@ -4228,9 +4292,27 @@ static void llm_load_vocab(
4228
4292
  if (add_space_prefix_keyidx != -1) {
4229
4293
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4230
4294
  } // The default value of add_space_prefix is true.
4231
- } else if (tokenizer_name == "gpt2") {
4232
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
4295
+ } else if (tokenizer_model == "bert") {
4296
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;
4233
4297
 
4298
+ // default special tokens
4299
+ vocab.special_bos_id = -1;
4300
+ vocab.special_eos_id = -1;
4301
+ vocab.special_unk_id = 100;
4302
+ vocab.special_sep_id = 102;
4303
+ vocab.special_pad_id = 0;
4304
+ vocab.special_cls_id = 101;
4305
+ vocab.special_mask_id = 103;
4306
+ vocab.add_space_prefix = false;
4307
+ } else {
4308
+ if (tokenizer_model == "gpt2") {
4309
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
4310
+ } else {
4311
+ LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
4312
+ LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4313
+ vocab.type = LLAMA_VOCAB_TYPE_SPM;
4314
+ return;
4315
+ }
4234
4316
  // read bpe merges and populate bpe ranks
4235
4317
  const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
4236
4318
  if (merges_keyidx == -1) {
@@ -4264,23 +4346,65 @@ static void llm_load_vocab(
4264
4346
  vocab.special_pad_id = -1;
4265
4347
  vocab.special_cls_id = -1;
4266
4348
  vocab.special_mask_id = -1;
4267
- } else if (tokenizer_name == "bert") {
4268
- vocab.type = LLAMA_VOCAB_TYPE_WPM;
4349
+ }
4269
4350
 
4270
- // default special tokens
4271
- vocab.special_bos_id = -1;
4272
- vocab.special_eos_id = -1;
4273
- vocab.special_unk_id = 100;
4274
- vocab.special_sep_id = 102;
4275
- vocab.special_pad_id = 0;
4276
- vocab.special_cls_id = 101;
4277
- vocab.special_mask_id = 103;
4278
- vocab.add_space_prefix = false;
4351
+ // for now, only BPE models have pre-tokenizers
4352
+ if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
4353
+ if (tokenizer_pre.empty()) {
4354
+ LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
4355
+ LLAMA_LOG_WARN("%s: \n", __func__);
4356
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4357
+ LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
4358
+ LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
4359
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4360
+ LLAMA_LOG_WARN("%s: \n", __func__);
4361
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4362
+ } else if (
4363
+ tokenizer_pre == "default") {
4364
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4365
+ } else if (
4366
+ tokenizer_pre == "llama3" ||
4367
+ tokenizer_pre == "llama-v3" ||
4368
+ tokenizer_pre == "llama-bpe") {
4369
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
4370
+ } else if (
4371
+ tokenizer_pre == "deepseek-llm") {
4372
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
4373
+ } else if (
4374
+ tokenizer_pre == "deepseek-coder") {
4375
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
4376
+ } else if (
4377
+ tokenizer_pre == "falcon") {
4378
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
4379
+ } else if (
4380
+ tokenizer_pre == "mpt") {
4381
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
4382
+ } else if (
4383
+ tokenizer_pre == "starcoder") {
4384
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
4385
+ } else if (
4386
+ tokenizer_pre == "gpt-2") {
4387
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4388
+ } else if (
4389
+ tokenizer_pre == "refact") {
4390
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
4391
+ } else if (
4392
+ tokenizer_pre == "command-r") {
4393
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
4394
+ } else if (
4395
+ tokenizer_pre == "qwen2") {
4396
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
4397
+ } else if (
4398
+ tokenizer_pre == "olmo") {
4399
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
4400
+ } else if (
4401
+ tokenizer_pre == "dbrx") {
4402
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
4403
+ } else {
4404
+ throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4405
+ }
4279
4406
  } else {
4280
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
4281
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4282
-
4283
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
4407
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4284
4408
  }
4285
4409
  }
4286
4410
 
@@ -5975,7 +6099,7 @@ static bool llm_load_tensors(
5975
6099
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
5976
6100
  static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
5977
6101
  try {
5978
- llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
6102
+ llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
5979
6103
 
5980
6104
  model.hparams.vocab_only = params.vocab_only;
5981
6105
 
@@ -6013,6 +6137,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
6013
6137
  || !(
6014
6138
  model.ftype == LLAMA_FTYPE_ALL_F32 ||
6015
6139
  model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
6140
+ model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
6016
6141
  model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
6017
6142
  model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
6018
6143
  )
@@ -6104,37 +6229,47 @@ static struct ggml_tensor * llm_build_inp_embd(
6104
6229
  static void llm_build_kv_store(
6105
6230
  struct ggml_context * ctx,
6106
6231
  const llama_hparams & hparams,
6232
+ const llama_cparams & cparams,
6107
6233
  const llama_kv_cache & kv,
6108
6234
  struct ggml_cgraph * graph,
6109
6235
  struct ggml_tensor * k_cur,
6110
6236
  struct ggml_tensor * v_cur,
6111
- int64_t n_ctx,
6112
6237
  int32_t n_tokens,
6113
6238
  int32_t kv_head,
6114
6239
  const llm_build_cb & cb,
6115
6240
  int64_t il) {
6241
+ const int64_t n_ctx = cparams.n_ctx;
6242
+
6116
6243
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
6117
6244
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
6118
6245
 
6119
6246
  GGML_ASSERT(kv.size == n_ctx);
6120
6247
 
6121
- // compute the transposed [n_tokens, n_embd] V matrix
6122
- assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
6123
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
6124
- cb(v_cur_t, "v_cur_t", il);
6125
-
6126
6248
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
6127
6249
  (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
6128
6250
  cb(k_cache_view, "k_cache_view", il);
6129
6251
 
6130
- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
6131
- ( n_ctx)*ggml_element_size(kv.v_l[il]),
6132
- (kv_head)*ggml_element_size(kv.v_l[il]));
6252
+ // note: storing RoPE-ed version of K in the KV cache
6253
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
6254
+
6255
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
6256
+
6257
+ struct ggml_tensor * v_cache_view = nullptr;
6258
+
6259
+ if (cparams.flash_attn) {
6260
+ v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
6261
+ (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
6262
+ } else {
6263
+ // note: the V cache is transposed when not using flash attention
6264
+ v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
6265
+ ( n_ctx)*ggml_element_size(kv.v_l[il]),
6266
+ (kv_head)*ggml_element_size(kv.v_l[il]));
6267
+
6268
+ v_cur = ggml_transpose(ctx, v_cur);
6269
+ }
6133
6270
  cb(v_cache_view, "v_cache_view", il);
6134
6271
 
6135
- // important: storing RoPE-ed version of K in the KV cache!
6136
- ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
6137
- ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
6272
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
6138
6273
  }
6139
6274
 
6140
6275
  static struct ggml_tensor * llm_build_norm(
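
The two branches above store V in different layouts: without flash attention the V cache keeps the historical transposed layout (one row of n_ctx elements per embedding dimension), while the flash-attention path appends each token's value vector contiguously. A toy sketch of the element offsets, with made-up sizes, just to make the difference concrete:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_ctx    = 8; // toy cache size
        const int64_t n_embd_v = 4; // toy value-vector width
        const int64_t cell     = 3; // KV cell being written
        const int64_t dim      = 2; // one component of the value vector

        // flash-attention layout: cache[cell][dim], contiguous per token
        const int64_t offs_flat  = cell*n_embd_v + dim;

        // transposed layout: cache[dim][cell], one row of n_ctx elements per dimension
        const int64_t offs_trans = dim*n_ctx + cell;

        std::printf("flat: %lld, transposed: %lld\n", (long long) offs_flat, (long long) offs_trans);
        return 0;
    }
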
@@ -6354,11 +6489,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
6354
6489
  return moe_out;
6355
6490
  }
6356
6491
 
6357
- // if max_alibi_bias > 0 then apply ALiBi
6358
6492
  static struct ggml_tensor * llm_build_kqv(
6359
6493
  struct ggml_context * ctx,
6360
6494
  const llama_model & model,
6361
6495
  const llama_hparams & hparams,
6496
+ const llama_cparams & cparams,
6362
6497
  const llama_kv_cache & kv,
6363
6498
  struct ggml_cgraph * graph,
6364
6499
  struct ggml_tensor * wo,
@@ -6366,12 +6501,12 @@ static struct ggml_tensor * llm_build_kqv(
6366
6501
  struct ggml_tensor * q_cur,
6367
6502
  struct ggml_tensor * kq_mask,
6368
6503
  struct ggml_tensor * kq_pos,
6369
- int64_t n_ctx,
6370
6504
  int32_t n_tokens,
6371
6505
  int32_t n_kv,
6372
6506
  float kq_scale,
6373
6507
  const llm_build_cb & cb,
6374
6508
  int il) {
6509
+ const int64_t n_ctx = cparams.n_ctx;
6375
6510
  const int64_t n_head = hparams.n_head;
6376
6511
  const int64_t n_head_kv = hparams.n_head_kv;
6377
6512
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -6389,71 +6524,99 @@ static struct ggml_tensor * llm_build_kqv(
6389
6524
  0);
6390
6525
  cb(k, "k", il);
6391
6526
 
6392
- struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6393
- cb(kq, "kq", il);
6527
+ struct ggml_tensor * cur;
6528
+
6529
+ if (cparams.flash_attn) {
6530
+ GGML_UNUSED(model);
6531
+ GGML_UNUSED(n_ctx);
6394
6532
 
6395
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6396
- // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6397
- // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6398
- ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6399
- }
6533
+ // note: if this assert triggers, then some check has failed earlier
6534
+ // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
6535
+ GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
6400
6536
 
6401
- if (model.arch == LLM_ARCH_GROK) {
6402
- // need to do the following:
6403
- // multiply by attn_output_multiplyer of 0.08838834764831845
6404
- // and then :
6405
- // kq = 30 * tanh(kq / 30)
6406
- // before the softmax below
6537
+ // split cached v into n_head heads (not transposed)
6538
+ struct ggml_tensor * v =
6539
+ ggml_view_3d(ctx, kv.v_l[il],
6540
+ n_embd_head_v, n_kv, n_head_kv,
6541
+ ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
6542
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
6543
+ 0);
6544
+ cb(v, "v", il);
6407
6545
 
6408
- //try from phi2
6409
- //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6546
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
6410
6547
 
6411
- kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
6412
- kq = ggml_scale(ctx, kq, 30);
6413
- }
6548
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6549
+ ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6550
+ }
6551
+
6552
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
6553
+ } else {
6554
+ struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6555
+ cb(kq, "kq", il);
6556
+
6557
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6558
+ // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6559
+ // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6560
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6561
+ }
6562
+
6563
+ if (model.arch == LLM_ARCH_GROK) {
6564
+ // need to do the following:
6565
+ // multiply by attn_output_multiplyer of 0.08838834764831845
6566
+ // and then :
6567
+ // kq = 30 * tanh(kq / 30)
6568
+ // before the softmax below
6569
+
6570
+ //try from phi2
6571
+ //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6572
+
6573
+ kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
6574
+ kq = ggml_scale(ctx, kq, 30);
6575
+ }
6414
6576
 
6415
6577
  #if defined(GGML_USE_KOMPUTE)
6416
6578
  #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
6417
6579
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
6418
6580
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
6419
- if (hparams.f_max_alibi_bias > 0.0f) {
6420
- kq = ggml_scale(ctx, kq, kq_scale);
6421
- cb(kq, "kq_scaled", il);
6581
+ if (hparams.use_alibi) {
6582
+ kq = ggml_scale(ctx, kq, kq_scale);
6583
+ cb(kq, "kq_scaled", il);
6422
6584
 
6423
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6424
- cb(kq, "kq_scaled_alibi", il);
6585
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6586
+ cb(kq, "kq_scaled_alibi", il);
6425
6587
 
6426
- kq = ggml_add(ctx, kq, kq_mask);
6427
- cb(kq, "kq_masked", il);
6588
+ kq = ggml_add(ctx, kq, kq_mask);
6589
+ cb(kq, "kq_masked", il);
6428
6590
 
6429
- kq = ggml_soft_max(ctx, kq);
6430
- cb(kq, "kq_soft_max", il);
6431
- } else
6591
+ kq = ggml_soft_max(ctx, kq);
6592
+ cb(kq, "kq_soft_max", il);
6593
+ } else
6432
6594
  #endif
6433
- {
6434
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6435
- cb(kq, "kq_soft_max_ext", il);
6436
- }
6595
+ {
6596
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6597
+ cb(kq, "kq_soft_max_ext", il);
6598
+ }
6437
6599
 
6438
- GGML_ASSERT(kv.size == n_ctx);
6600
+ GGML_ASSERT(kv.size == n_ctx);
6439
6601
 
6440
- // split cached v into n_head heads
6441
- struct ggml_tensor * v =
6442
- ggml_view_3d(ctx, kv.v_l[il],
6443
- n_kv, n_embd_head_v, n_head_kv,
6444
- ggml_element_size(kv.v_l[il])*n_ctx,
6445
- ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
6446
- 0);
6447
- cb(v, "v", il);
6602
+ // split cached v into n_head heads
6603
+ struct ggml_tensor * v =
6604
+ ggml_view_3d(ctx, kv.v_l[il],
6605
+ n_kv, n_embd_head_v, n_head_kv,
6606
+ ggml_element_size(kv.v_l[il])*n_ctx,
6607
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
6608
+ 0);
6609
+ cb(v, "v", il);
6448
6610
 
6449
- struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
6450
- cb(kqv, "kqv", il);
6611
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
6612
+ cb(kqv, "kqv", il);
6451
6613
 
6452
- struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6453
- cb(kqv_merged, "kqv_merged", il);
6614
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6615
+ cb(kqv_merged, "kqv_merged", il);
6454
6616
 
6455
- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6456
- cb(cur, "kqv_merged_cont", il);
6617
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6618
+ cb(cur, "kqv_merged_cont", il);
6619
+ }
6457
6620
 
6458
6621
  ggml_build_forward_expand(graph, cur);
6459
6622
 
@@ -6473,6 +6636,7 @@ static struct ggml_tensor * llm_build_kv(
6473
6636
  struct ggml_context * ctx,
6474
6637
  const llama_model & model,
6475
6638
  const llama_hparams & hparams,
6639
+ const llama_cparams & cparams,
6476
6640
  const llama_kv_cache & kv,
6477
6641
  struct ggml_cgraph * graph,
6478
6642
  struct ggml_tensor * wo,
@@ -6482,7 +6646,6 @@ static struct ggml_tensor * llm_build_kv(
6482
6646
  struct ggml_tensor * q_cur,
6483
6647
  struct ggml_tensor * kq_mask,
6484
6648
  struct ggml_tensor * kq_pos,
6485
- int64_t n_ctx,
6486
6649
  int32_t n_tokens,
6487
6650
  int32_t kv_head,
6488
6651
  int32_t n_kv,
@@ -6496,12 +6659,12 @@ static struct ggml_tensor * llm_build_kv(
6496
6659
  ggml_build_forward_expand(graph, k_cur);
6497
6660
  ggml_build_forward_expand(graph, v_cur);
6498
6661
 
6499
- llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
6662
+ llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
6500
6663
 
6501
6664
  struct ggml_tensor * cur;
6502
6665
 
6503
- cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
6504
- q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
6666
+ cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
6667
+ q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
6505
6668
  cb(cur, "kqv_out", il);
6506
6669
 
6507
6670
  return cur;
@@ -6543,6 +6706,8 @@ struct llm_build_context {
6543
6706
  const int32_t kv_head; // index of where we store new KV data in the cache
6544
6707
  const int32_t n_orig_ctx;
6545
6708
 
6709
+ const bool flash_attn;
6710
+
6546
6711
  const enum llama_pooling_type pooling_type;
6547
6712
  const enum llama_rope_type rope_type;
6548
6713
 
@@ -6589,6 +6754,7 @@ struct llm_build_context {
6589
6754
  n_outputs (worst_case ? n_tokens : lctx.n_outputs),
6590
6755
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
6591
6756
  n_orig_ctx (cparams.n_yarn_orig_ctx),
6757
+ flash_attn (cparams.flash_attn),
6592
6758
  pooling_type (cparams.pooling_type),
6593
6759
  rope_type (hparams.rope_type),
6594
6760
  cb (cb),
@@ -6703,15 +6869,31 @@ struct llm_build_context {
6703
6869
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
6704
6870
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
6705
6871
 
6706
- ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6707
- nm, n_embd_v_gqa,
6708
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6709
- ggml_row_size(kv_self.v_l[il]->type, i));
6872
+ ggml_tensor * view_v_src;
6873
+ ggml_tensor * view_v_dst;
6874
+
6875
+ if (flash_attn) {
6876
+ // NOTE: the V cache is not transposed when using flash attention
6877
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6878
+ n_embd_v_gqa, nm,
6879
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
6880
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
6710
6881
 
6711
- ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6712
- nm, n_embd_v_gqa,
6713
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6714
- ggml_row_size(kv_self.v_l[il]->type, id));
6882
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6883
+ n_embd_v_gqa, nm,
6884
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
6885
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
6886
+ } else {
6887
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6888
+ nm, n_embd_v_gqa,
6889
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6890
+ ggml_row_size(kv_self.v_l[il]->type, i));
6891
+
6892
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6893
+ nm, n_embd_v_gqa,
6894
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6895
+ ggml_row_size(kv_self.v_l[il]->type, id));
6896
+ }
6715
6897
 
6716
6898
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
6717
6899
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
@@ -6741,20 +6923,26 @@ struct llm_build_context {
6741
6923
 
6742
6924
  struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
6743
6925
  if (causal) {
6744
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
6926
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
6745
6927
  } else {
6746
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
6928
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
6747
6929
  }
6748
6930
  cb(lctx.inp_KQ_mask, "KQ_mask", -1);
6749
6931
  ggml_set_input(lctx.inp_KQ_mask);
6750
- return lctx.inp_KQ_mask;
6932
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
6751
6933
  }
6752
6934
 
6753
- struct ggml_tensor * build_inp_KQ_pos() {
6754
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6935
+ struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
6936
+ if (causal) {
6937
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6938
+ } else {
6939
+ // TODO: this will be needed for ALiBi-based BERT models
6940
+ // https://github.com/ggerganov/llama.cpp/pull/6826
6941
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
6942
+ }
6755
6943
  cb(lctx.inp_KQ_pos, "KQ_pos", -1);
6756
6944
  ggml_set_input(lctx.inp_KQ_pos);
6757
- return lctx.inp_KQ_pos;
6945
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
6758
6946
  }
6759
6947
 
6760
6948
  struct ggml_tensor * build_inp_mean() {
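
GGML_PAD rounds its first argument up to the next multiple of the second (a power of two), so the KQ mask above is allocated with its token dimension padded to GGML_KQ_MASK_PAD (assumed to be 32 in the bundled ggml) as required by the flash-attention kernels. The macro reduces to simple bit arithmetic, shown here self-contained; the same padding is what keeps kv_self.n a multiple of 256 further down in llama_decode_internal.

    #include <cstdio>

    // same definition as in ggml.h; n must be a power of two
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main() {
        for (int n_tokens : {1, 31, 32, 33, 100}) {
            std::printf("n_tokens=%3d -> padded=%3d\n", n_tokens, GGML_PAD(n_tokens, 32));
        }
        return 0;
    }
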
@@ -6860,9 +7048,9 @@ struct llm_build_context {
6860
7048
  );
6861
7049
  cb(Kcur, "Kcur", il);
6862
7050
 
6863
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7051
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
6864
7052
  model.layers[il].wo, model.layers[il].bo,
6865
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7053
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6866
7054
  }
6867
7055
 
6868
7056
  if (il == n_layer - 1) {
@@ -7000,9 +7188,9 @@ struct llm_build_context {
7000
7188
  cb(Qcur, "Qcur", il);
7001
7189
  cb(Kcur, "Kcur", il);
7002
7190
 
7003
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7191
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7004
7192
  model.layers[il].wo, NULL,
7005
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7193
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7006
7194
  }
7007
7195
 
7008
7196
  if (il == n_layer - 1) {
@@ -7107,9 +7295,9 @@ struct llm_build_context {
7107
7295
  ext_factor, attn_factor, beta_fast, beta_slow
7108
7296
  );
7109
7297
  cb(Kcur, "Kcur", il);
7110
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7298
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7111
7299
  model.layers[il].wo, NULL,
7112
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7300
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7113
7301
  }
7114
7302
 
7115
7303
  if (il == n_layer - 1) {
@@ -7227,9 +7415,9 @@ struct llm_build_context {
7227
7415
  );
7228
7416
  cb(Kcur, "Kcur", il);
7229
7417
 
7230
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7418
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7231
7419
  model.layers[il].wo, NULL,
7232
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7420
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7233
7421
  }
7234
7422
 
7235
7423
  if (il == n_layer - 1) {
@@ -7352,9 +7540,9 @@ struct llm_build_context {
7352
7540
  );
7353
7541
  cb(Kcur, "Kcur", il);
7354
7542
 
7355
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7543
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7356
7544
  model.layers[il].wo, model.layers[il].bo,
7357
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7545
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7358
7546
  }
7359
7547
 
7360
7548
  if (il == n_layer - 1) {
@@ -7504,9 +7692,9 @@ struct llm_build_context {
7504
7692
  );
7505
7693
  cb(Kcur, "Kcur", il);
7506
7694
 
7507
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7508
- model.layers[il].wo, NULL,
7509
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7695
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7696
+ model.layers[il].wo, NULL,
7697
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7510
7698
  }
7511
7699
 
7512
7700
  if (il == n_layer - 1) {
@@ -7616,9 +7804,9 @@ struct llm_build_context {
7616
7804
 
7617
7805
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7618
7806
 
7619
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7807
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7620
7808
  model.layers[il].wo, model.layers[il].bo,
7621
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7809
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7622
7810
  }
7623
7811
 
7624
7812
  if (il == n_layer - 1) {
@@ -7820,9 +8008,9 @@ struct llm_build_context {
7820
8008
  );
7821
8009
  cb(Vcur, "Vcur", il);
7822
8010
 
7823
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8011
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7824
8012
  model.layers[il].wo, model.layers[il].bo,
7825
- Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8013
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7826
8014
  }
7827
8015
 
7828
8016
  if (il == n_layer - 1) {
@@ -7916,9 +8104,9 @@ struct llm_build_context {
7916
8104
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7917
8105
  cb(Qcur, "Qcur", il);
7918
8106
 
7919
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8107
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7920
8108
  model.layers[il].wo, NULL,
7921
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8109
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7922
8110
  }
7923
8111
 
7924
8112
  if (il == n_layer - 1) {
@@ -8209,9 +8397,9 @@ struct llm_build_context {
8209
8397
 
8210
8398
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8211
8399
 
8212
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8400
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8213
8401
  model.layers[il].wo, model.layers[il].bo,
8214
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8402
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8215
8403
  }
8216
8404
 
8217
8405
  if (il == n_layer - 1) {
@@ -8340,14 +8528,15 @@ struct llm_build_context {
8340
8528
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8341
8529
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8342
8530
 
8343
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8344
- model.layers[il].wo, model.layers[il].bo,
8345
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8531
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8532
+ model.layers[il].wo, model.layers[il].bo,
8533
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8346
8534
  } else {
8347
8535
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8348
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8536
+
8537
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8349
8538
  model.layers[il].wo, model.layers[il].bo,
8350
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8539
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8351
8540
  }
8352
8541
  }
8353
8542
 
@@ -8489,9 +8678,9 @@ struct llm_build_context {
8489
8678
  );
8490
8679
  cb(Kcur, "Kcur", il);
8491
8680
 
8492
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8681
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8493
8682
  model.layers[il].wo, NULL,
8494
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8683
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8495
8684
  }
8496
8685
 
8497
8686
  if (il == n_layer - 1) {
@@ -8607,9 +8796,9 @@ struct llm_build_context {
8607
8796
  );
8608
8797
  cb(Kcur, "Kcur", il);
8609
8798
 
8610
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8799
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8611
8800
  model.layers[il].wo, NULL,
8612
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8801
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8613
8802
  }
8614
8803
 
8615
8804
  if (il == n_layer - 1) {
@@ -8720,9 +8909,9 @@ struct llm_build_context {
8720
8909
  );
8721
8910
  cb(Kcur, "Kcur", il);
8722
8911
 
8723
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8912
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8724
8913
  model.layers[il].wo, model.layers[il].bo,
8725
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8914
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8726
8915
  }
8727
8916
 
8728
8917
  if (il == n_layer - 1) {
@@ -8834,9 +9023,9 @@ struct llm_build_context {
8834
9023
  );
8835
9024
  cb(Kcur, "Kcur", il);
8836
9025
 
8837
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9026
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8838
9027
  model.layers[il].wo, model.layers[il].bo,
8839
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9028
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8840
9029
  }
8841
9030
 
8842
9031
  if (il == n_layer - 1) {
@@ -8989,9 +9178,9 @@ struct llm_build_context {
8989
9178
  );
8990
9179
  cb(Kcur, "Kcur", il);
8991
9180
 
8992
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9181
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8993
9182
  model.layers[il].wo, model.layers[il].bo,
8994
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9183
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8995
9184
  }
8996
9185
 
8997
9186
  if (il == n_layer - 1) {
@@ -9106,9 +9295,9 @@ struct llm_build_context {
9106
9295
  );
9107
9296
  cb(Kcur, "Kcur", il);
9108
9297
 
9109
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9110
- model.layers[il].wo, NULL,
9111
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9298
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9299
+ model.layers[il].wo, model.layers[il].bo,
9300
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9112
9301
  }
9113
9302
 
9114
9303
  if (il == n_layer - 1) {
@@ -9219,9 +9408,9 @@ struct llm_build_context {
9219
9408
  ext_factor, attn_factor, beta_fast, beta_slow);
9220
9409
  cb(Kcur, "Kcur", il);
9221
9410
 
9222
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9411
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9223
9412
  model.layers[il].wo, NULL,
9224
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9413
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9225
9414
  }
9226
9415
  struct ggml_tensor * sa_out = cur;
9227
9416
 
@@ -9322,9 +9511,9 @@ struct llm_build_context {
9322
9511
 
9323
9512
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9324
9513
 
9325
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9514
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9326
9515
  model.layers[il].wo, model.layers[il].bo,
9327
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9516
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9328
9517
  }
9329
9518
 
9330
9519
  if (il == n_layer - 1) {
@@ -9429,9 +9618,9 @@ struct llm_build_context {
9429
9618
  );
9430
9619
  cb(Kcur, "Kcur", il);
9431
9620
 
9432
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9621
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9433
9622
  model.layers[il].wo, model.layers[il].bo,
9434
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9623
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9435
9624
  }
9436
9625
 
9437
9626
  if (il == n_layer - 1) {
@@ -9545,9 +9734,9 @@ struct llm_build_context {
9545
9734
  );
9546
9735
  cb(Kcur, "Kcur", il);
9547
9736
 
9548
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9737
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9549
9738
  model.layers[il].wo, NULL,
9550
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9739
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9551
9740
  }
9552
9741
 
9553
9742
  if (il == n_layer - 1) {
@@ -9662,9 +9851,9 @@ struct llm_build_context {
9662
9851
  );
9663
9852
  cb(Kcur, "Kcur", il);
9664
9853
 
9665
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9854
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9666
9855
  model.layers[il].wo, model.layers[il].bo,
9667
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9856
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9668
9857
  }
9669
9858
 
9670
9859
  if (il == n_layer - 1) {
@@ -9792,9 +9981,9 @@ struct llm_build_context {
9792
9981
  );
9793
9982
  cb(Kcur, "Kcur", il);
9794
9983
 
9795
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9984
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9796
9985
  model.layers[il].wo, model.layers[il].bo,
9797
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9986
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9798
9987
  }
9799
9988
 
9800
9989
  if (il == n_layer - 1) {
@@ -9913,9 +10102,9 @@ struct llm_build_context {
9913
10102
  ext_factor, attn_factor, beta_fast, beta_slow);
9914
10103
  cb(Kcur, "Kcur", il);
9915
10104
 
9916
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10105
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9917
10106
  model.layers[il].wo, NULL,
9918
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10107
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9919
10108
  }
9920
10109
 
9921
10110
  if (il == n_layer - 1) {
@@ -10032,9 +10221,9 @@ struct llm_build_context {
10032
10221
  );
10033
10222
  cb(Kcur, "Kcur", il);
10034
10223
 
10035
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10224
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10036
10225
  model.layers[il].wo, model.layers[il].bo,
10037
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10226
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10038
10227
  }
10039
10228
 
10040
10229
  if (il == n_layer - 1) {
@@ -10322,9 +10511,9 @@ struct llm_build_context {
10322
10511
  );
10323
10512
  cb(Kcur, "Kcur", il);
10324
10513
 
10325
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10514
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10326
10515
  model.layers[il].wo, model.layers[il].bo,
10327
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10516
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10328
10517
  }
10329
10518
 
10330
10519
  if (il == n_layer - 1) {
@@ -10453,9 +10642,9 @@ struct llm_build_context {
10453
10642
  );
10454
10643
  cb(Kcur, "Kcur", il);
10455
10644
 
10456
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10645
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10457
10646
  model.layers[il].wo, nullptr,
10458
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10647
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10459
10648
  }
10460
10649
 
10461
10650
  if (il == n_layer - 1) {
@@ -10882,7 +11071,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
10882
11071
  }
10883
11072
  }
10884
11073
 
10885
- if (hparams.need_kq_pos) {
11074
+ // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
11075
+ // this allows to process multiple sequences in parallel with ALiBi-based models
11076
+ if (hparams.use_alibi) {
10886
11077
  const int64_t n_kv = kv_self.n;
10887
11078
 
10888
11079
  GGML_ASSERT(lctx.inp_KQ_pos);
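
For context on why ALiBi needs KQ_pos: each attention head h adds a fixed, position-dependent bias slope_h * (j - i) to the attention score, so the graph needs the absolute position of every token in the batch. The sketch below uses the textbook slope schedule from the ALiBi paper for a power-of-two head count; ggml derives its slopes from f_max_alibi_bias, so the exact numbers in llama.cpp differ.

    #include <cmath>
    #include <cstdio>

    int main() {
        const int n_head = 8; // toy head count (power of two)
        for (int h = 0; h < n_head; ++h) {
            const float slope = std::pow(2.0f, -8.0f*(h + 1)/n_head);
            // bias applied to a key 4 positions in the past (j - i == -4)
            std::printf("head %d: slope=%.6f bias(dist=4)=%.6f\n", h, slope, slope*-4.0f);
        }
        return 0;
    }
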
@@ -11264,7 +11455,7 @@ static int llama_decode_internal(
11264
11455
  // a heuristic, to avoid attending the full cache if it is not yet utilized
11265
11456
  // after enough generations, the benefit from this heuristic disappears
11266
11457
  // if we start defragmenting the cache, the benefit from this will be more important
11267
- kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
11458
+ kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
11268
11459
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
11269
11460
  }
11270
11461
  }
@@ -11432,6 +11623,10 @@ static int llama_decode_internal(
11432
11623
  }
11433
11624
  }
11434
11625
 
11626
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
11627
+ // overlap with device computation.
11628
+ ggml_backend_sched_reset(lctx.sched);
11629
+
11435
11630
  return 0;
11436
11631
  }
11437
11632
 
@@ -11457,7 +11652,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
11457
11652
  // each move requires 6*n_layer tensors (see build_defrag)
11458
11653
  // - source view, destination view, copy operation
11459
11654
  // - x2 for keys and values
11460
- const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
11655
+ //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
11656
+ // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
11657
+ const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
11461
11658
 
11462
11659
  // determine which KV cells to move where
11463
11660
  //
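
The temporary fix above effectively reserves 2*n_layer graph nodes as headroom before dividing the remaining budget by the 6*n_layer nodes each move costs. Worked numbers for two typical layer counts, using LLAMA_MAX_NODES = 8192 from earlier in this file:

    #include <cstdio>

    int main() {
        const unsigned max_nodes = 8192; // LLAMA_MAX_NODES
        for (unsigned n_layer : {32u, 80u}) {
            const unsigned old_moves = max_nodes/(6*n_layer);
            const unsigned new_moves = (max_nodes - 2*n_layer)/(6*n_layer);
            std::printf("n_layer=%2u: old max_moves=%u new max_moves=%u\n", n_layer, old_moves, new_moves);
        }
        return 0;
    }
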
@@ -11773,7 +11970,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
11773
11970
  static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
11774
11971
  GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
11775
11972
  GGML_ASSERT(llama_is_byte_token(vocab, id));
11776
- const auto& token_data = vocab.id_to_token.at(id);
11973
+ const auto & token_data = vocab.id_to_token.at(id);
11777
11974
  switch (llama_vocab_get_type(vocab)) {
11778
11975
  case LLAMA_VOCAB_TYPE_SPM: {
11779
11976
  auto buf = token_data.text.substr(3, 2);
@@ -11781,7 +11978,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
11781
11978
  }
11782
11979
  case LLAMA_VOCAB_TYPE_BPE: {
11783
11980
  GGML_ASSERT(false);
11784
- return unicode_utf8_to_byte(token_data.text);
11981
+ return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
11785
11982
  }
11786
11983
  case LLAMA_VOCAB_TYPE_WPM: {
11787
11984
  GGML_ASSERT(false);
@@ -12003,7 +12200,94 @@ struct llm_tokenizer_bpe {
12003
12200
 
12004
12201
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12005
12202
  int final_prev_index = -1;
12006
- auto word_collection = bpe_gpt2_preprocess(text);
12203
+
12204
+ std::vector<std::string> word_collection;
12205
+ switch (vocab.type) {
12206
+ case LLAMA_VOCAB_TYPE_BPE:
12207
+ switch (vocab.type_pre) {
12208
+ case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12209
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
12210
+ word_collection = unicode_regex_split(text, {
12211
+ // original regex from tokenizer.json
12212
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12213
+
12214
+ // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
12215
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12216
+ });
12217
+ break;
12218
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12219
+ word_collection = unicode_regex_split(text, {
12220
+ "[\r\n]",
12221
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
12222
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
12223
+ "\\s+$",
12224
+ "[一-龥ࠀ-一가-퟿]+",
12225
+ "\\p{N}+",
12226
+ });
12227
+ break;
12228
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
12229
+ word_collection = unicode_regex_split(text, {
12230
+ "[\r\n]",
12231
+ "\\s?\\p{L}+",
12232
+ "\\s?\\p{P}+",
12233
+ "[一-龥ࠀ-一가-퟿]+",
12234
+ "\\p{N}",
12235
+ });
12236
+ break;
12237
+ case LLAMA_VOCAB_PRE_TYPE_FALCON:
12238
+ word_collection = unicode_regex_split(text, {
12239
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12240
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12241
+ "[0-9][0-9][0-9]",
12242
+ });
12243
+ break;
12244
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
12245
+ // TODO: MPT pre-tokenization regexes are unknown
12246
+ // the regexes below are close, but not exact; to check them, run:
12247
+ // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
12248
+ // note: a string literal is always truthy, so this GGML_ASSERT never fires - it is only a marker/reminder
+ GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
12249
+ word_collection = unicode_regex_split(text, {
12250
+ "\\s?\\p{L}+",
12251
+ "\\s?\\p{P}+",
12252
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12253
+ });
12254
+ break;
12255
+ case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12256
+ case LLAMA_VOCAB_PRE_TYPE_REFACT:
12257
+ case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
12258
+ word_collection = unicode_regex_split(text, {
12259
+ "\\p{N}",
12260
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12261
+ });
12262
+ break;
12263
+ case LLAMA_VOCAB_PRE_TYPE_GPT2:
12264
+ case LLAMA_VOCAB_PRE_TYPE_OLMO:
12265
+ word_collection = unicode_regex_split(text, {
12266
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12267
+ });
12268
+ break;
12269
+ case LLAMA_VOCAB_PRE_TYPE_QWEN2:
12270
+ word_collection = unicode_regex_split(text, {
12271
+ // original regex from tokenizer.json
12272
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
12273
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12274
+ });
12275
+ break;
12276
+ default:
12277
+ // default regex for BPE tokenization pre-processing
12278
+ word_collection = unicode_regex_split(text, {
12279
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12280
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12281
+ "\\p{N}+",
12282
+ "[0-9][0-9][0-9]",
12283
+ });
12284
+ break;
12285
+ }
12286
+ break;
12287
+ default:
12288
+ GGML_ASSERT(false);
12289
+ break;
12290
+ }
12007
12291
 
12008
12292
  symbols_final.clear();
12009
12293
 
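The per-model regex lists above (and the removal of the hand-written bpe_gpt2_preprocess state machine in the next hunk) move pre-tokenization onto unicode_regex_split. std::regex cannot handle the \p{...} Unicode classes used in those patterns, so the sketch below only demonstrates the splitting idea on an ASCII-only simplification of the GPT-2 pattern; it is not the engine llama.cpp uses:

// toy pre-tokenizer split on an ASCII approximation of the GPT-2 pattern (illustrative only)
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    // contractions | optional-space letters | optional-space digits | optional-space other | whitespace
    const std::regex re("'s|'t|'re|'ve|'m|'ll|'d| ?[A-Za-z]+| ?[0-9]+| ?[^\\sA-Za-z0-9]+|\\s+");

    const std::string text = "Hello world, it's 2024!";
    std::vector<std::string> words;
    for (std::sregex_iterator it(text.begin(), text.end(), re), end; it != end; ++it) {
        words.push_back(it->str());
    }

    for (const auto & w : words) {
        std::printf("[%s] ", w.c_str()); // [Hello] [ world] [,] [ it] ['s] [ 2024] [!]
    }
    std::printf("\n");
    return 0;
}
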
@@ -12130,145 +12414,6 @@ private:
12130
12414
  work_queue.push(bigram);
12131
12415
  }
12132
12416
 
12133
- std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
12134
- std::vector<std::string> bpe_words;
12135
- std::vector<std::string> bpe_encoded_words;
12136
-
12137
- std::string token = "";
12138
- // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
12139
- bool collecting_numeric = false;
12140
- bool collecting_letter = false;
12141
- bool collecting_special = false;
12142
- bool collecting_whitespace_lookahead = false;
12143
- bool collecting = false;
12144
-
12145
- std::vector<std::string> text_utf;
12146
- text_utf.reserve(text.size());
12147
- bpe_words.reserve(text.size());
12148
- bpe_encoded_words.reserve(text.size());
12149
-
12150
- const auto cpts = unicode_cpts_from_utf8(text);
12151
- for (size_t i = 0; i < cpts.size(); ++i)
12152
- text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
12153
-
12154
- for (int i = 0; i < (int)text_utf.size(); i++) {
12155
- const std::string & utf_char = text_utf[i];
12156
- bool split_condition = false;
12157
- int bytes_remain = text_utf.size() - i;
12158
- // forward backward lookups
12159
- const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
12160
- const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
12161
-
12162
- // handling contractions
12163
- if (!split_condition && bytes_remain >= 2) {
12164
- // 's|'t|'m|'d
12165
- if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
12166
- split_condition = true;
12167
- }
12168
- if (split_condition) {
12169
- if (token.size()) {
12170
- bpe_words.emplace_back(token); // push previous content as token
12171
- }
12172
- token = utf_char + utf_char_next;
12173
- bpe_words.emplace_back(token);
12174
- token = "";
12175
- i++;
12176
- continue;
12177
- }
12178
- }
12179
- if (!split_condition && bytes_remain >= 3) {
12180
- // 're|'ve|'ll
12181
- if (utf_char == "\'" && (
12182
- (utf_char_next == "r" && utf_char_next_next == "e") ||
12183
- (utf_char_next == "v" && utf_char_next_next == "e") ||
12184
- (utf_char_next == "l" && utf_char_next_next == "l"))
12185
- ) {
12186
- split_condition = true;
12187
- }
12188
- if (split_condition) {
12189
- // current token + next token can be defined
12190
- if (token.size()) {
12191
- bpe_words.emplace_back(token); // push previous content as token
12192
- }
12193
- token = utf_char + utf_char_next + utf_char_next_next;
12194
- bpe_words.emplace_back(token); // the contraction
12195
- token = "";
12196
- i += 2;
12197
- continue;
12198
- }
12199
- }
12200
-
12201
- if (!split_condition && !collecting) {
12202
- if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
12203
- collecting_letter = true;
12204
- collecting = true;
12205
- }
12206
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
12207
- collecting_numeric = true;
12208
- collecting = true;
12209
- }
12210
- else if (
12211
- ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
12212
- (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
12213
- ) {
12214
- collecting_special = true;
12215
- collecting = true;
12216
- }
12217
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
12218
- collecting_whitespace_lookahead = true;
12219
- collecting = true;
12220
- }
12221
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
12222
- split_condition = true;
12223
- }
12224
- }
12225
- else if (!split_condition && collecting) {
12226
- if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
12227
- split_condition = true;
12228
- }
12229
- else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
12230
- split_condition = true;
12231
- }
12232
- else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
12233
- split_condition = true;
12234
- }
12235
- else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
12236
- split_condition = true;
12237
- }
12238
- }
12239
-
12240
- if (utf_char_next == "") {
12241
- split_condition = true; // final
12242
- token += utf_char;
12243
- }
12244
-
12245
- if (split_condition) {
12246
- if (token.size()) {
12247
- bpe_words.emplace_back(token);
12248
- }
12249
- token = utf_char;
12250
- collecting = false;
12251
- collecting_letter = false;
12252
- collecting_numeric = false;
12253
- collecting_special = false;
12254
- collecting_whitespace_lookahead = false;
12255
- }
12256
- else {
12257
- token += utf_char;
12258
- }
12259
- }
12260
-
12261
- for (std::string & word : bpe_words) {
12262
- std::string encoded_token = "";
12263
- for (char & c : word) {
12264
- encoded_token += unicode_byte_to_utf8(c);
12265
- }
12266
- bpe_encoded_words.emplace_back(encoded_token);
12267
- }
12268
-
12269
- return bpe_encoded_words;
12270
- }
12271
-
12272
12417
  const llama_vocab & vocab;
12273
12418
 
12274
12419
  std::vector<llm_symbol> symbols;
@@ -12343,7 +12488,7 @@ struct llm_tokenizer_wpm {
12343
12488
  continue;
12344
12489
  }
12345
12490
  code = unicode_tolower(code);
12346
- if (type == CODEPOINT_TYPE_WHITESPACE) {
12491
+ if (type == CODEPOINT_TYPE_SEPARATOR) {
12347
12492
  code = ' ';
12348
12493
  }
12349
12494
  std::string s = unicode_cpt_to_utf8(code);
@@ -12588,7 +12733,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12588
12733
  } break;
12589
12734
  case LLAMA_VOCAB_TYPE_BPE:
12590
12735
  {
12591
- if (add_special && vocab.special_add_bos == 1) {
12736
+ if (add_special && vocab.special_add_bos != 0) {
12592
12737
  GGML_ASSERT(vocab.special_bos_id != -1);
12593
12738
  output.push_back(vocab.special_bos_id);
12594
12739
  }
@@ -14030,13 +14175,16 @@ static void llama_tensor_dequantize_internal(
14030
14175
  if (qtype.to_float == NULL) {
14031
14176
  throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
14032
14177
  }
14033
- } else if (tensor->type != GGML_TYPE_F16) {
14178
+ } else if (tensor->type != GGML_TYPE_F16 &&
14179
+ tensor->type != GGML_TYPE_BF16) {
14034
14180
  throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
14035
14181
  }
14036
14182
 
14037
14183
  if (nthread < 2) {
14038
14184
  if (tensor->type == GGML_TYPE_F16) {
14039
14185
  ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
14186
+ } else if (tensor->type == GGML_TYPE_BF16) {
14187
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
14040
14188
  } else if (ggml_is_quantized(tensor->type)) {
14041
14189
  qtype.to_float(tensor->data, f32_output, nelements);
14042
14190
  } else {
@@ -14045,7 +14193,14 @@ static void llama_tensor_dequantize_internal(
14045
14193
  return;
14046
14194
  }
14047
14195
 
14048
- size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
14196
+ size_t block_size;
14197
+ if (tensor->type == GGML_TYPE_F16 ||
14198
+ tensor->type == GGML_TYPE_BF16) {
14199
+ block_size = 1;
14200
+ } else {
14201
+ block_size = (size_t)ggml_blck_size(tensor->type);
14202
+ }
14203
+
14049
14204
  size_t block_size_bytes = ggml_type_size(tensor->type);
14050
14205
 
14051
14206
  GGML_ASSERT(nelements % block_size == 0);
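
For quantized types the dequantization loop walks whole blocks, so block_size comes from ggml_blck_size, while F16/BF16 use a "block" of one element. A worked example of the bookkeeping, assuming Q4_0's usual layout of 32 elements per 18-byte block:

// block bookkeeping for dequantization (Q4_0 layout assumed: 32 elements / 18 bytes per block)
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t nelements        = 4096 * 32; // hypothetical tensor size
    const size_t  block_size       = 32;        // elements per block (ggml_blck_size)
    const size_t  block_size_bytes = 18;        // bytes per block (ggml_type_size)

    const int64_t nblocks = nelements / (int64_t) block_size; // nelements % block_size == 0 must hold
    std::printf("blocks=%lld, quantized bytes=%lld, fp32 bytes=%lld\n",
                (long long) nblocks,
                (long long) nblocks * (long long) block_size_bytes,
                (long long) nelements * (long long) sizeof(float));
    return 0;
}
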
@@ -14064,6 +14219,8 @@ static void llama_tensor_dequantize_internal(
14064
14219
  auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
14065
14220
  if (typ == GGML_TYPE_F16) {
14066
14221
  ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
14222
+ } else if (typ == GGML_TYPE_BF16) {
14223
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
14067
14224
  } else {
14068
14225
  qtype.to_float(inbuf, outbuf, nels);
14069
14226
  }
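
Both the single-threaded path and the worker lambda above now branch on GGML_TYPE_BF16 and call ggml_bf16_to_fp32_row. The conversion itself is cheap because bfloat16 is simply the top 16 bits of an IEEE-754 float32; a standalone sketch of the idea (not the ggml implementation):

// bfloat16 -> float32: place the 16 bits in the high half of a 32-bit word (illustrative)
#include <cstdint>
#include <cstdio>
#include <cstring>

static float bf16_to_f32(uint16_t h) {
    const uint32_t u = (uint32_t) h << 16; // bf16 shares float32's sign, exponent and top 7 mantissa bits
    float f;
    std::memcpy(&f, &u, sizeof(f));        // bit-cast without violating aliasing rules
    return f;
}

int main() {
    std::printf("%f %f\n", bf16_to_f32(0x3F80), bf16_to_f32(0xC048)); // 1.000000 -3.125000
    return 0;
}
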
@@ -14360,14 +14517,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
14360
14517
  }
14361
14518
 
14362
14519
  static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
14363
- std::mutex mutex;
14364
- int64_t counter = 0;
14365
- size_t new_size = 0;
14366
14520
  if (nthread < 2) {
14367
14521
  // single-thread
14368
- return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14522
+ size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14523
+ if (!ggml_validate_row_data(new_type, new_data, new_size)) {
14524
+ throw std::runtime_error("quantized data validation failed");
14525
+ }
14526
+ return new_size;
14369
14527
  }
14370
- auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
14528
+
14529
+ std::mutex mutex;
14530
+ int64_t counter = 0;
14531
+ size_t new_size = 0;
14532
+ bool valid = true;
14533
+ auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
14371
14534
  nrows, n_per_row, imatrix]() {
14372
14535
  const int64_t nrows_per_chunk = chunk_size / n_per_row;
14373
14536
  size_t local_size = 0;
@@ -14382,7 +14545,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14382
14545
  }
14383
14546
  lock.unlock();
14384
14547
  const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
14385
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14548
+ size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14549
+ local_size += this_size;
14550
+
14551
+ // validate the quantized data
14552
+ const size_t row_size = ggml_row_size(new_type, n_per_row);
14553
+ void * this_data = (char *) new_data + first_row * row_size;
14554
+ if (!ggml_validate_row_data(new_type, this_data, this_size)) {
14555
+ std::unique_lock<std::mutex> lock(mutex);
14556
+ valid = false;
14557
+ break;
14558
+ }
14386
14559
  }
14387
14560
  };
14388
14561
  for (int it = 0; it < nthread - 1; ++it) {
@@ -14391,6 +14564,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14391
14564
  compute();
14392
14565
  for (auto & w : workers) { w.join(); }
14393
14566
  workers.clear();
14567
+ if (!valid) {
14568
+ throw std::runtime_error("quantized data validation failed");
14569
+ }
14394
14570
  return new_size;
14395
14571
  }
14396
14572
 
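Each worker now validates the chunk it just wrote and records a failure in a shared flag, and the caller throws only after every thread has been joined. The synchronization pattern in isolation (a sketch, not the llama.cpp routine):

// workers record a shared failure under a mutex; the caller reports only after join() (illustrative)
#include <cstdio>
#include <mutex>
#include <stdexcept>
#include <thread>
#include <vector>

int main() {
    std::mutex mutex;
    bool valid = true;

    auto worker = [&](int id) {
        const bool ok = (id != 2); // pretend chunk 2 fails validation
        if (!ok) {
            std::lock_guard<std::mutex> lock(mutex);
            valid = false;         // record the failure, but keep the thread joinable
        }
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) workers.emplace_back(worker, i);
    for (auto & w : workers) w.join(); // always join before reporting, as the quantizer does

    try {
        if (!valid) throw std::runtime_error("quantized data validation failed");
        std::puts("all chunks valid");
    } catch (const std::exception & e) {
        std::printf("error: %s\n", e.what());
    }
    return 0;
}
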
@@ -14405,6 +14581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14405
14581
  case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
14406
14582
  case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
14407
14583
  case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
14584
+ case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
14408
14585
  case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
14409
14586
 
14410
14587
  // K-quants
@@ -14453,7 +14630,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14453
14630
  auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
14454
14631
  kv_overrides = v->data();
14455
14632
  }
14456
- llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
14633
+ llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
14457
14634
  ml.init_mappings(false); // no prefetching
14458
14635
 
14459
14636
  llama_model model;
@@ -14491,11 +14668,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14491
14668
  for (auto & o : overrides) {
14492
14669
  if (o.key[0] == 0) break;
14493
14670
  if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
14494
- gguf_set_val_f32(ctx_out, o.key, o.float_value);
14671
+ gguf_set_val_f32(ctx_out, o.key, o.val_f64);
14495
14672
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
14496
- gguf_set_val_i32(ctx_out, o.key, o.int_value);
14673
+ gguf_set_val_i32(ctx_out, o.key, o.val_i64);
14497
14674
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
14498
- gguf_set_val_bool(ctx_out, o.key, o.bool_value);
14675
+ gguf_set_val_bool(ctx_out, o.key, o.val_bool);
14676
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
14677
+ gguf_set_val_str(ctx_out, o.key, o.val_str);
14499
14678
  } else {
14500
14679
  LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
14501
14680
  }
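
The writer above now reads the renamed union members (val_f64, val_i64, val_bool) and gains string overrides via val_str. A hedged caller-side sketch of filling such an entry; only the field and tag names come from this diff, while the stand-in struct layout and buffer sizes are assumptions:

// filling a string KV override (stand-in types; the real definitions live in llama.h)
#include <cstdint>
#include <cstdio>

enum my_kv_override_type { KV_OVERRIDE_FLOAT, KV_OVERRIDE_INT, KV_OVERRIDE_BOOL, KV_OVERRIDE_STR };

struct my_kv_override {
    char key[128];            // assumed buffer size
    my_kv_override_type tag;
    union {
        double  val_f64;
        int64_t val_i64;
        bool    val_bool;
        char    val_str[128]; // assumed buffer size
    };
};

int main() {
    my_kv_override o = {};
    std::snprintf(o.key, sizeof(o.key), "%s", "general.name"); // illustrative key
    o.tag = KV_OVERRIDE_STR;
    std::snprintf(o.val_str, sizeof(o.val_str), "%s", "my-model");

    // the quantizer then dispatches on tag, e.g. STR -> gguf_set_val_str(ctx_out, o.key, o.val_str)
    std::printf("override: %s = \"%s\"\n", o.key, o.val_str);
    return 0;
}
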
@@ -14814,7 +14993,7 @@ static int llama_apply_lora_from_file_internal(
14814
14993
  std::unique_ptr<llama_model_loader> ml;
14815
14994
  if (path_base_model) {
14816
14995
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
14817
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
14996
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
14818
14997
  ml->init_mappings(/*prefetch*/ false); // no prefetching
14819
14998
  }
14820
14999
 
@@ -15073,6 +15252,7 @@ struct llama_model_params llama_model_default_params() {
15073
15252
  /*.vocab_only =*/ false,
15074
15253
  /*.use_mmap =*/ true,
15075
15254
  /*.use_mlock =*/ false,
15255
+ /*.check_tensors =*/ false,
15076
15256
  };
15077
15257
 
15078
15258
  #ifdef GGML_USE_METAL
@@ -15109,6 +15289,7 @@ struct llama_context_params llama_context_default_params() {
15109
15289
  /*.logits_all =*/ false,
15110
15290
  /*.embeddings =*/ false,
15111
15291
  /*.offload_kqv =*/ true,
15292
+ /*.flash_attn =*/ false,
15112
15293
  /*.abort_callback =*/ nullptr,
15113
15294
  /*.abort_callback_data =*/ nullptr,
15114
15295
  };
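
flash_attn is now a plain context parameter, off by default. A minimal usage sketch; the loading calls are the usual llama.h entry points as I understand them, and the model path is a placeholder:

// enabling the new flash_attn context flag (sketch)
#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (!model) { std::fprintf(stderr, "failed to load model\n"); return 1; }

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // defaults to false, as set above

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (!ctx) { std::fprintf(stderr, "failed to create context\n"); llama_free_model(model); return 1; }

    // ... decode as usual; note that flash_attn is forced back off for ALiBi and Grok models (see below) ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
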
@@ -15275,6 +15456,7 @@ struct llama_context * llama_new_context_with_model(
15275
15456
  cparams.defrag_thold = params.defrag_thold;
15276
15457
  cparams.embeddings = params.embeddings;
15277
15458
  cparams.offload_kqv = params.offload_kqv;
15459
+ cparams.flash_attn = params.flash_attn;
15278
15460
  cparams.pooling_type = params.pooling_type;
15279
15461
 
15280
15462
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -15282,12 +15464,20 @@ struct llama_context * llama_new_context_with_model(
15282
15464
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
15283
15465
 
15284
15466
  // this is necessary due to kv_self.n being padded later during inference
15285
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
15467
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
15286
15468
 
15287
15469
  // with causal attention, the batch size is limited by the context size
15288
15470
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
15289
- cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
15290
15471
 
15472
+ // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
15473
+ // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
15474
+ // ref: https://github.com/ggerganov/llama.cpp/pull/5021
15475
+ if (cparams.n_batch < GGML_KQ_MASK_PAD) {
15476
+ LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
15477
+ cparams.n_batch = GGML_KQ_MASK_PAD;
15478
+ }
15479
+
15480
+ cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
15291
15481
 
15292
15482
  cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
15293
15483
  hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
@@ -15319,6 +15509,16 @@ struct llama_context * llama_new_context_with_model(
15319
15509
  }
15320
15510
  }
15321
15511
 
15512
+ if (cparams.flash_attn && hparams.use_alibi) {
15513
+ LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
15514
+ cparams.flash_attn = false;
15515
+ }
15516
+
15517
+ if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
15518
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
15519
+ cparams.flash_attn = false;
15520
+ }
15521
+
15322
15522
  if (params.seed == LLAMA_DEFAULT_SEED) {
15323
15523
  params.seed = time(NULL);
15324
15524
  }
@@ -15326,6 +15526,7 @@ struct llama_context * llama_new_context_with_model(
15326
15526
  LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
15327
15527
  LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
15328
15528
  LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
15529
+ LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
15329
15530
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
15330
15531
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
15331
15532
 
@@ -15454,7 +15655,7 @@ struct llama_context * llama_new_context_with_model(
15454
15655
  }
15455
15656
  ctx->backends.push_back(ctx->backend_cpu);
15456
15657
 
15457
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
15658
+ if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
15458
15659
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
15459
15660
  llama_free(ctx);
15460
15661
  return nullptr;
@@ -16053,6 +16254,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
16053
16254
  const size_t s_kv_head = sizeof(uint32_t);
16054
16255
  const size_t s_kv_size = sizeof(uint32_t);
16055
16256
  const size_t s_kv_used = sizeof(uint32_t);
16257
+ const size_t s_v_trans = sizeof(uint32_t);
16056
16258
  const size_t s_kv = ctx->kv_self.total_size();
16057
16259
  const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
16058
16260
  const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
@@ -16070,10 +16272,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
16070
16272
  + s_kv_head
16071
16273
  + s_kv_size
16072
16274
  + s_kv_used
16275
+ + s_v_trans
16073
16276
  + s_kv
16074
16277
  + s_kv_cells
16075
16278
  );
16076
16279
 
16280
+ // when the session version is bumped, the state size has very likely changed as well - update this function accordingly
16281
+ static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
16282
+
16077
16283
  return s_total;
16078
16284
  }
16079
16285
 
@@ -16219,11 +16425,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
16219
16425
  const uint32_t kv_size = kv_self.size;
16220
16426
  const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
16221
16427
  const uint32_t kv_used = kv_self.used;
16428
+ const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
16222
16429
 
16223
16430
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
16224
16431
  data_ctx->write(&kv_head, sizeof(kv_head));
16225
16432
  data_ctx->write(&kv_size, sizeof(kv_size));
16226
16433
  data_ctx->write(&kv_used, sizeof(kv_used));
16434
+ data_ctx->write(&v_trans, sizeof(v_trans));
16227
16435
 
16228
16436
  if (kv_buf_size) {
16229
16437
  const size_t pre_kv_buf_size = data_ctx->get_size_written();
@@ -16236,7 +16444,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
16236
16444
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
16237
16445
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
16238
16446
 
16239
- if (kv_self.recurrent) {
16447
+ if (kv_self.recurrent || !kv_self.v_trans) {
16240
16448
  // v is contiguous for recurrent models
16241
16449
  // TODO: use other tensors for state models than k and v
16242
16450
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16369,11 +16577,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16369
16577
  uint32_t kv_head;
16370
16578
  uint32_t kv_size;
16371
16579
  uint32_t kv_used;
16580
+ uint32_t v_trans;
16372
16581
 
16373
16582
  memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
16374
16583
  memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
16375
16584
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
16376
16585
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
16586
+ memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
16587
+
16588
+ GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
16377
16589
 
16378
16590
  if (kv_self.size != kv_size) {
16379
16591
  // the KV cache needs to be big enough to load all the KV cells from the saved state
@@ -16383,6 +16595,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16383
16595
  __func__, kv_head, kv_size, kv_self.size);
16384
16596
  }
16385
16597
 
16598
+ llama_kv_cache_clear(ctx);
16599
+
16386
16600
  if (kv_buf_size) {
16387
16601
  const size_t pre_kv_buf_size = inp - src;
16388
16602
 
@@ -16394,7 +16608,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16394
16608
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
16395
16609
  inp += k_size;
16396
16610
 
16397
- if (kv_self.recurrent) {
16611
+ if (kv_self.recurrent || !kv_self.v_trans) {
16398
16612
  // v is contiguous for recurrent models
16399
16613
  // TODO: use other tensors for state models than k and v
16400
16614
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16416,8 +16630,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16416
16630
  GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
16417
16631
  }
16418
16632
 
16419
- llama_kv_cache_clear(ctx);
16420
-
16421
16633
  ctx->kv_self.head = kv_head;
16422
16634
  ctx->kv_self.used = kv_used;
16423
16635
 
@@ -16677,28 +16889,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
16677
16889
  }
16678
16890
  }
16679
16891
 
16680
- // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16681
- const uint32_t kv_size = kv_self.size;
16682
- for (int il = 0; il < (int)n_layer; ++il) {
16683
- // Write value type
16684
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16685
- data_ctx.write(&v_type_i, sizeof(v_type_i));
16892
+ // TODO: simplify, reduce copy-paste
16893
+ if (!kv_self.v_trans) {
16894
+ for (int il = 0; il < (int)n_layer; ++il) {
16895
+ // Write value type
16896
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16897
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16686
16898
 
16687
- // Write element size
16688
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16689
- data_ctx.write(&v_size_el, sizeof(v_size_el));
16899
+ // Write row size of value
16900
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
16901
+ data_ctx.write(&v_size_row, sizeof(v_size_row));
16690
16902
 
16691
- // For each row, we get the element values of each cell
16692
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16693
- // Read each range of cells of v_size_el length each into tmp_buf and write out
16903
+ // Read each range of cells of v_size length each into tmp_buf and write out
16694
16904
  for (const auto & range : cell_ranges) {
16695
16905
  const size_t range_size = range.second - range.first;
16696
- const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16697
- tmp_buf.resize(range_size * v_size_el);
16698
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16906
+ tmp_buf.resize(range_size * v_size_row);
16907
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
16699
16908
  data_ctx.write(tmp_buf.data(), tmp_buf.size());
16700
16909
  }
16701
16910
  }
16911
+ } else {
16912
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16913
+ const uint32_t kv_size = kv_self.size;
16914
+ for (int il = 0; il < (int)n_layer; ++il) {
16915
+ // Write value type
16916
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16917
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16918
+
16919
+ // Write element size
16920
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16921
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
16922
+
16923
+ // For each row, we get the element values of each cell
16924
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16925
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
16926
+ for (const auto & range : cell_ranges) {
16927
+ const size_t range_size = range.second - range.first;
16928
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16929
+ tmp_buf.resize(range_size * v_size_el);
16930
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16931
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
16932
+ }
16933
+ }
16934
+ }
16702
16935
  }
16703
16936
 
16704
16937
  return data_ctx.get_size_written();
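
With flash attention the V cache is kept row-contiguous (v_trans == false), so a whole range of cells can be written with a single tensor read per layer, while the transposed layout still needs one strided read per embedding row. The offset arithmetic from the two branches above, side by side:

// offset arithmetic for saving a range of V cells: contiguous vs transposed layout (hypothetical sizes)
#include <cstddef>
#include <cstdio>

int main() {
    const size_t kv_size      = 1024; // total cells in the cache
    const size_t n_embd_v_gqa = 128;  // V row width
    const size_t elt_size     = 2;    // e.g. an fp16 element
    const size_t row_size     = n_embd_v_gqa * elt_size;

    const size_t cell_first = 100;    // first cell of the saved range
    const size_t n_cells    = 4;

    // v_trans == false: cells are rows, one contiguous copy covers the whole range
    std::printf("contiguous: offset=%zu bytes=%zu (single copy)\n",
                cell_first * row_size, n_cells * row_size);

    // v_trans == true: element (cell c, row j) sits at (c + j*kv_size)*elt_size, one copy per row j
    for (size_t j = 0; j < 3; ++j) {
        std::printf("transposed row %zu: offset=%zu bytes=%zu\n",
                    j, (cell_first + j * kv_size) * elt_size, n_cells * elt_size);
    }
    return 0;
}
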
@@ -16823,41 +17056,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
16823
17056
  }
16824
17057
  }
16825
17058
 
16826
- // For each layer, read the values for each cell (transposed)
16827
- for (int il = 0; il < (int)n_layer; ++il) {
16828
- // Read type of value
16829
- int32_t v_type_i_ref;
16830
- memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
16831
- inp += sizeof(v_type_i_ref);
16832
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16833
- if (v_type_i != v_type_i_ref) {
16834
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16835
- LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
16836
- return 0;
16837
- }
17059
+ // TODO: simplify, reduce copy-paste
17060
+ if (!kv_self.v_trans) {
17061
+ for (int il = 0; il < (int)n_layer; ++il) {
17062
+ // Read type of value
17063
+ int32_t v_type_i_ref;
17064
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
17065
+ inp += sizeof(v_type_i_ref);
17066
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
17067
+ if (v_type_i != v_type_i_ref) {
17068
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17069
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
17070
+ return 0;
17071
+ }
16838
17072
 
16839
- // Read element size of value
16840
- size_t v_size_el_ref;
16841
- memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
16842
- inp += sizeof(v_size_el_ref);
16843
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16844
- if (v_size_el != v_size_el_ref) {
16845
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16846
- LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
16847
- return 0;
16848
- }
17073
+ // Read row size of value
17074
+ size_t v_size_row_ref;
17075
+ memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
17076
+ inp += sizeof(v_size_row_ref);
17077
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
17078
+ if (v_size_row != v_size_row_ref) {
17079
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17080
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
17081
+ return 0;
17082
+ }
16849
17083
 
16850
- if (cell_count) {
16851
- // For each row in the transposed matrix, read the values for the whole cell range
16852
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16853
- const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
16854
- ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
16855
- inp += cell_count * v_size_el;
17084
+ if (cell_count) {
17085
+ // Read and set the values for the whole cell range
17086
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
17087
+ inp += cell_count * v_size_row;
17088
+ }
17089
+ }
17090
+ } else {
17091
+ // For each layer, read the values for each cell (transposed)
17092
+ for (int il = 0; il < (int)n_layer; ++il) {
17093
+ // Read type of value
17094
+ int32_t v_type_i_ref;
17095
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
17096
+ inp += sizeof(v_type_i_ref);
17097
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
17098
+ if (v_type_i != v_type_i_ref) {
17099
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17100
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
17101
+ return 0;
17102
+ }
17103
+
17104
+ // Read element size of value
17105
+ size_t v_size_el_ref;
17106
+ memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
17107
+ inp += sizeof(v_size_el_ref);
17108
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
17109
+ if (v_size_el != v_size_el_ref) {
17110
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17111
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
17112
+ return 0;
17113
+ }
17114
+
17115
+ if (cell_count) {
17116
+ // For each row in the transposed matrix, read the values for the whole cell range
17117
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
17118
+ const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
17119
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
17120
+ inp += cell_count * v_size_el;
17121
+ }
16856
17122
  }
16857
17123
  }
16858
17124
  }
16859
17125
 
16860
17126
  const size_t nread = inp - src;
17127
+
16861
17128
  return nread;
16862
17129
  }
16863
17130
 
@@ -17238,9 +17505,10 @@ int32_t llama_tokenize(
17238
17505
 
17239
17506
  static std::string llama_decode_text(const std::string & text) {
17240
17507
  std::string decoded_text;
17241
- auto unicode_sequences = unicode_cpts_from_utf8(text);
17242
- for (auto & unicode_sequence : unicode_sequences) {
17243
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
17508
+
17509
+ const auto cpts = unicode_cpts_from_utf8(text);
17510
+ for (const auto cpt : cpts) {
17511
+ decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
17244
17512
  }
17245
17513
 
17246
17514
  return decoded_text;
@@ -17604,7 +17872,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
17604
17872
  /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
17605
17873
 
17606
17874
  /*.n_sample =*/ std::max(1, ctx->n_sample),
17607
- /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
17875
+ /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
17608
17876
  /*.n_eval =*/ std::max(1, ctx->n_eval),
17609
17877
  };
17610
17878
 
@@ -17654,9 +17922,9 @@ const char * llama_print_system_info(void) {
17654
17922
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
17655
17923
  s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
17656
17924
  #ifdef GGML_USE_LLAMAFILE
17657
- s += "LAMMAFILE = 1 | ";
17925
+ s += "LLAMAFILE = 1 | ";
17658
17926
  #else
17659
- s += "LAMMAFILE = 0 | ";
17927
+ s += "LLAMAFILE = 0 | ";
17660
17928
  #endif
17661
17929
 
17662
17930
  return s.c_str();