llama_cpp 0.14.7 → 0.15.0

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -75,6 +75,7 @@
75
75
  #include <forward_list>
76
76
  #include <fstream>
77
77
  #include <functional>
78
+ #include <future>
78
79
  #include <initializer_list>
79
80
  #include <locale>
80
81
  #include <map>
@@ -107,7 +108,6 @@
107
108
  #define LLAMA_MAX_NODES 8192
108
109
  #define LLAMA_MAX_EXPERTS 60
109
110
 
110
-
111
111
  //
112
112
  // logging
113
113
  //
@@ -316,6 +316,7 @@ enum llm_kv {
316
316
  LLM_KV_SSM_TIME_STEP_RANK,
317
317
 
318
318
  LLM_KV_TOKENIZER_MODEL,
319
+ LLM_KV_TOKENIZER_PRE,
319
320
  LLM_KV_TOKENIZER_LIST,
320
321
  LLM_KV_TOKENIZER_TOKEN_TYPE,
321
322
  LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
392
393
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
393
394
 
394
395
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
396
+ { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
395
397
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
396
398
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
397
399
  { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
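The enum and name-table hunks above introduce `tokenizer.ggml.pre`, a GGUF metadata key recording which pre-tokenizer regex set the model was converted with. A minimal sketch of inspecting the key with the public gguf API (the gguf_* calls exist in ggml; the model path is a placeholder):

    #include "ggml.h"
    #include <cstdio>

    int main(void) {
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
        struct gguf_context * gctx = gguf_init_from_file("model.gguf", params); // placeholder path
        if (!gctx) {
            return 1;
        }
        const int keyidx = gguf_find_key(gctx, "tokenizer.ggml.pre");
        if (keyidx != -1) {
            // the value should be one of the names handled in llm_load_vocab, e.g. "llama-bpe"
            printf("pre-tokenizer: %s\n", gguf_get_val_str(gctx, keyidx));
        } else {
            printf("no pre-tokenizer key (model converted before this release)\n");
        }
        gguf_free(gctx);
        return 0;
    }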
@@ -1843,7 +1845,7 @@ struct llama_hparams {
1843
1845
  float f_logit_scale = 0.0f;
1844
1846
 
1845
1847
  bool causal_attn = true;
1846
- bool need_kq_pos = false;
1848
+ bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
1847
1849
 
1848
1850
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1849
1851
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1933,6 +1935,7 @@ struct llama_cparams {
1933
1935
  bool embeddings;
1934
1936
  bool causal_attn;
1935
1937
  bool offload_kqv;
1938
+ bool flash_attn;
1936
1939
 
1937
1940
  enum llama_pooling_type pooling_type;
1938
1941
 
@@ -2036,8 +2039,8 @@ struct llama_kv_cache {
2036
2039
  bool has_shift = false;
2037
2040
  bool do_defrag = false;
2038
2041
  bool do_copy = false;
2039
- // with recurrent state models, a cell can hold the state for more than one past token
2040
- bool recurrent = false;
2042
+ bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
2043
+ bool v_trans = true; // the value tensor is transposed
2041
2044
 
2042
2045
  // Note: The value of head isn't only used to optimize searching
2043
2046
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -2114,7 +2117,8 @@ struct llama_vocab {
2114
2117
  ttype type;
2115
2118
  };
2116
2119
 
2117
- enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
2120
+ enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
2121
+ enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2118
2122
 
2119
2123
  std::unordered_map<token, id> token_to_id;
2120
2124
  std::vector<token_data> id_to_token;
@@ -2335,11 +2339,14 @@ struct llama_context {
2335
2339
 
2336
2340
  static bool llama_kv_cache_init(
2337
2341
  struct llama_kv_cache & cache,
2338
- const llama_model & model,
2342
+ const llama_context * ctx,
2339
2343
  ggml_type type_k,
2340
2344
  ggml_type type_v,
2341
2345
  uint32_t kv_size,
2342
2346
  bool offload) {
2347
+ const llama_model & model = ctx->model;
2348
+ const llama_cparams & cparams = ctx->cparams;
2349
+
2343
2350
  const struct llama_hparams & hparams = model.hparams;
2344
2351
 
2345
2352
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
@@ -2350,8 +2357,9 @@ static bool llama_kv_cache_init(
2350
2357
 
2351
2358
  // TODO: find a nicer way to add other recurrent model architectures
2352
2359
  cache.recurrent = model.arch == LLM_ARCH_MAMBA;
2360
+ cache.v_trans = !cparams.flash_attn;
2353
2361
 
2354
- // TODO: support mixed reccurent Transformer architectues
2362
+ // TODO: support mixed recurrent Transformer architectures
2355
2363
  // NOTE: (!a || b) is a logical implication (a -> b)
2356
2364
  GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
2357
2365
  GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
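The new `flash_attn` context parameter is threaded down to the KV cache here: when flash attention is enabled, `v_trans` becomes false and the V tensors are stored row-major like K. A sketch of opting in from the public API, assuming the `flash_attn` field added to `llama_context_params` in this release:

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // use the fused ggml_flash_attn_ext path; keeps the V cache untransposed

    llama_context * ctx = llama_new_context_with_model(model, cparams);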
@@ -2562,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
2562
2570
  }
2563
2571
  cache.head = 0;
2564
2572
  cache.used = 0;
2573
+
2574
+ for (auto & buf : cache.bufs) {
2575
+ ggml_backend_buffer_clear(buf, 0);
2576
+ }
2565
2577
  }
2566
2578
 
2567
2579
  static bool llama_kv_cache_seq_rm(
@@ -2882,6 +2894,7 @@ namespace GGUFMeta {
2882
2894
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
2883
2895
  case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
2884
2896
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
2897
+ case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
2885
2898
  }
2886
2899
  return "unknown";
2887
2900
  }
@@ -2893,13 +2906,16 @@ namespace GGUFMeta {
2893
2906
  __func__, override_type_to_str(ovrd->tag), ovrd->key);
2894
2907
  switch (ovrd->tag) {
2895
2908
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
2896
- LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
2909
+ LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
2897
2910
  } break;
2898
2911
  case LLAMA_KV_OVERRIDE_TYPE_INT: {
2899
- LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
2912
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
2900
2913
  } break;
2901
2914
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
2902
- LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
2915
+ LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
2916
+ } break;
2917
+ case LLAMA_KV_OVERRIDE_TYPE_STR: {
2918
+ LLAMA_LOG_INFO("%s\n", ovrd->val_str);
2903
2919
  } break;
2904
2920
  default:
2905
2921
  // Shouldn't be possible to end up here, but just in case...
@@ -2918,7 +2934,7 @@ namespace GGUFMeta {
2918
2934
  static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
2919
2935
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2920
2936
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
2921
- target = ovrd->bool_value;
2937
+ target = ovrd->val_bool;
2922
2938
  return true;
2923
2939
  }
2924
2940
  return false;
@@ -2928,7 +2944,7 @@ namespace GGUFMeta {
2928
2944
  static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
2929
2945
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2930
2946
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
2931
- target = ovrd->int_value;
2947
+ target = ovrd->val_i64;
2932
2948
  return true;
2933
2949
  }
2934
2950
  return false;
@@ -2938,7 +2954,7 @@ namespace GGUFMeta {
2938
2954
  static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
2939
2955
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
2940
2956
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
2941
- target = ovrd->float_value;
2957
+ target = ovrd->val_f64;
2942
2958
  return true;
2943
2959
  }
2944
2960
  return false;
@@ -2947,12 +2963,11 @@ namespace GGUFMeta {
2947
2963
  template<typename OT>
2948
2964
  static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
2949
2965
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
2950
- (void)target;
2951
- (void)ovrd;
2952
- if (!ovrd) { return false; }
2953
- // Currently, we should never end up here so it would be a bug if we do.
2954
- throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
2955
- ovrd ? ovrd->key : "NULL"));
2966
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
2967
+ target = ovrd->val_str;
2968
+ return true;
2969
+ }
2970
+ return false;
2956
2971
  }
2957
2972
 
2958
2973
  static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
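With the `LLAMA_KV_OVERRIDE_TYPE_STR` case filled in, string metadata can now be overridden at load time instead of throwing. A hedged usage sketch, assuming `key` and `val_str` are fixed-size char arrays in the public `llama_model_kv_override` struct (field names follow the renames above):

    #include <cstring>

    llama_model_kv_override kvo[2];
    std::memset(kvo, 0, sizeof(kvo)); // a zeroed entry (empty key) terminates the list

    std::strncpy(kvo[0].key, "tokenizer.ggml.pre", sizeof(kvo[0].key) - 1);
    kvo[0].tag = LLAMA_KV_OVERRIDE_TYPE_STR;
    std::strncpy(kvo[0].val_str, "llama3", sizeof(kvo[0].val_str) - 1);

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = kvo; // e.g. to retrofit a pre-tokenizer type onto an older GGUF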
@@ -2985,6 +3000,7 @@ struct llama_model_loader {
2985
3000
  size_t n_bytes = 0;
2986
3001
 
2987
3002
  bool use_mmap = false;
3003
+ bool check_tensors;
2988
3004
 
2989
3005
  llama_files files;
2990
3006
  llama_ftype ftype;
@@ -3018,7 +3034,7 @@ struct llama_model_loader {
3018
3034
  std::string arch_name;
3019
3035
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
3020
3036
 
3021
- llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
3037
+ llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
3022
3038
  int trace = 0;
3023
3039
  if (getenv("LLAMA_TRACE")) {
3024
3040
  trace = atoi(getenv("LLAMA_TRACE"));
@@ -3115,9 +3131,17 @@ struct llama_model_loader {
3115
3131
 
3116
3132
  fver = (enum llama_fver) gguf_get_version(meta);
3117
3133
 
3134
+ std::set<std::string> tensor_names;
3118
3135
  for (auto & w : weights) {
3119
3136
  n_elements += ggml_nelements(w.tensor);
3120
3137
  n_bytes += ggml_nbytes(w.tensor);
3138
+ // make sure there are no duplicate tensor names
3139
+ const std::string name(w.tensor->name);
3140
+ auto found = tensor_names.find(name);
3141
+ if (found != tensor_names.end()) {
3142
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
3143
+ }
3144
+ tensor_names.insert(name);
3121
3145
  }
3122
3146
 
3123
3147
  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -3223,6 +3247,7 @@ struct llama_model_loader {
3223
3247
  }
3224
3248
 
3225
3249
  this->use_mmap = use_mmap;
3250
+ this->check_tensors = check_tensors;
3226
3251
  }
3227
3252
 
3228
3253
  ~llama_model_loader() {
@@ -3481,6 +3506,10 @@ struct llama_model_loader {
3481
3506
  file->seek(w.offs, SEEK_SET);
3482
3507
  file->read_raw(cur->data, ggml_nbytes(cur));
3483
3508
  }
3509
+
3510
+ if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
3511
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3512
+ }
3484
3513
  }
3485
3514
 
3486
3515
  size_t size_done = 0;
@@ -3497,6 +3526,8 @@ struct llama_model_loader {
3497
3526
  GGML_ASSERT(size_data != 0 && "call init_mappings() first");
3498
3527
 
3499
3528
  std::vector<no_init<uint8_t>> read_buf;
3529
+ std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
3530
+
3500
3531
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3501
3532
  const auto * weight = get_weight(ggml_get_name(cur));
3502
3533
  if (weight == nullptr) {
@@ -3518,37 +3549,66 @@ struct llama_model_loader {
3518
3549
  if (bufs_mmap.count(weight->idx)) {
3519
3550
  buf_mmap = bufs_mmap.at(weight->idx);
3520
3551
  }
3552
+ uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
3553
+
3554
+ if (check_tensors) {
3555
+ validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
3556
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
3557
+ }));
3558
+ }
3559
+
3521
3560
  GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
3522
3561
  if (buf_mmap && cur->data == nullptr) {
3523
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
3562
+ ggml_backend_tensor_alloc(buf_mmap, cur, data);
3524
3563
  if (lmlocks) {
3525
3564
  const auto & lmlock = lmlocks->at(weight->idx);
3526
- lmlock->grow_to(weight->offs + ggml_nbytes(cur));
3565
+ lmlock->grow_to(weight->offs + n_size);
3527
3566
  }
3528
3567
 
3529
3568
  auto & mmap_used = mmaps_used[weight->idx];
3530
3569
  mmap_used.first = std::min(mmap_used.first, weight->offs);
3531
3570
  mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
3532
3571
  } else {
3533
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
3572
+ ggml_backend_tensor_set(cur, data, 0, n_size);
3534
3573
  }
3535
3574
  } else {
3536
3575
  GGML_ASSERT(weight->idx < files.size());
3537
3576
  const auto & file = files.at(weight->idx);
3538
3577
  if (ggml_backend_buffer_is_host(cur->buffer)) {
3539
3578
  file->seek(weight->offs, SEEK_SET);
3540
- file->read_raw(cur->data, ggml_nbytes(cur));
3579
+ file->read_raw(cur->data, n_size);
3580
+ if (check_tensors) {
3581
+ validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
3582
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
3583
+ }));
3584
+ }
3541
3585
  } else {
3542
- read_buf.resize(ggml_nbytes(cur));
3586
+ read_buf.resize(n_size);
3543
3587
  file->seek(weight->offs, SEEK_SET);
3544
- file->read_raw(read_buf.data(), ggml_nbytes(cur));
3588
+ file->read_raw(read_buf.data(), n_size);
3545
3589
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3590
+ if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
3591
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3592
+ }
3546
3593
  }
3547
3594
  }
3548
3595
 
3549
3596
  size_done += n_size;
3550
3597
  }
3551
3598
 
3599
+ // check validation results
3600
+ bool validation_failed = false;
3601
+ for (auto & future : validation_result) {
3602
+ auto result = future.get();
3603
+ if (!result.second) {
3604
+ LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
3605
+ validation_failed = true;
3606
+ }
3607
+ }
3608
+ if (validation_failed) {
3609
+ throw std::runtime_error("found tensors with invalid data");
3610
+ }
3611
+
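The mmap branch above fans each tensor check out with `std::async` (hence the new `<future>` include) and drains the futures only after all reads have been issued, so validation overlaps file I/O. A self-contained sketch of the same fan-out/fan-in shape; `blob` and `looks_valid` are illustrative stand-ins, not loader API:

    #include <cstdint>
    #include <future>
    #include <string>
    #include <utility>
    #include <vector>

    struct blob { std::string name; std::vector<uint8_t> data; };

    static bool looks_valid(const blob & b) {
        return !b.data.empty(); // stand-in for ggml_validate_row_data
    }

    static bool validate_all(const std::vector<blob> & blobs) {
        std::vector<std::future<std::pair<const blob *, bool>>> results;
        results.reserve(blobs.size());
        for (const auto & b : blobs) {
            // one async task per tensor; the caller keeps loading meanwhile
            results.emplace_back(std::async(std::launch::async, [&b] {
                return std::make_pair(&b, looks_valid(b));
            }));
        }
        bool ok = true;
        for (auto & f : results) {
            ok = f.get().second && ok; // drain every future so no task outlives the data
        }
        return ok;
    }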
3552
3612
  // check if this is the last call and do final cleanup
3553
3613
  if (size_done >= size_data) {
3554
3614
  // unmap offloaded tensors and metadata
@@ -4142,7 +4202,7 @@ static void llm_load_hparams(
4142
4202
  model.ftype = ml.ftype;
4143
4203
 
4144
4204
  if (hparams.f_max_alibi_bias > 0.0f) {
4145
- hparams.need_kq_pos = true;
4205
+ hparams.use_alibi = true;
4146
4206
  }
4147
4207
 
4148
4208
  hparams.rope_type = llama_rope_type(&model);
@@ -4165,11 +4225,13 @@ static void llm_load_vocab(
4165
4225
 
4166
4226
  // determine vocab type
4167
4227
  {
4168
- std::string tokenizer_name;
4228
+ std::string tokenizer_model;
4229
+ std::string tokenizer_pre;
4169
4230
 
4170
- ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
4231
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
4232
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
4171
4233
 
4172
- if (tokenizer_name == "no_vocab") {
4234
+ if (tokenizer_model == "no_vocab") {
4173
4235
  vocab.type = LLAMA_VOCAB_TYPE_NONE;
4174
4236
 
4175
4237
  // default special tokens
@@ -4183,7 +4245,7 @@ static void llm_load_vocab(
4183
4245
  vocab.linefeed_id = -1;
4184
4246
 
4185
4247
  return;
4186
- } else if (tokenizer_name == "llama") {
4248
+ } else if (tokenizer_model == "llama") {
4187
4249
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
4188
4250
 
4189
4251
  // default special tokens
@@ -4228,9 +4290,27 @@ static void llm_load_vocab(
4228
4290
  if (add_space_prefix_keyidx != -1) {
4229
4291
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4230
4292
  } // The default value of add_space_prefix is true.
4231
- } else if (tokenizer_name == "gpt2") {
4232
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
4293
+ } else if (tokenizer_model == "bert") {
4294
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;
4233
4295
 
4296
+ // default special tokens
4297
+ vocab.special_bos_id = -1;
4298
+ vocab.special_eos_id = -1;
4299
+ vocab.special_unk_id = 100;
4300
+ vocab.special_sep_id = 102;
4301
+ vocab.special_pad_id = 0;
4302
+ vocab.special_cls_id = 101;
4303
+ vocab.special_mask_id = 103;
4304
+ vocab.add_space_prefix = false;
4305
+ } else {
4306
+ if (tokenizer_model == "gpt2") {
4307
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
4308
+ } else {
4309
+ LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
4310
+ LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4311
+ vocab.type = LLAMA_VOCAB_TYPE_SPM;
4312
+ return;
4313
+ }
4234
4314
  // read bpe merges and populate bpe ranks
4235
4315
  const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
4236
4316
  if (merges_keyidx == -1) {
@@ -4264,23 +4344,50 @@ static void llm_load_vocab(
4264
4344
  vocab.special_pad_id = -1;
4265
4345
  vocab.special_cls_id = -1;
4266
4346
  vocab.special_mask_id = -1;
4267
- } else if (tokenizer_name == "bert") {
4268
- vocab.type = LLAMA_VOCAB_TYPE_WPM;
4347
+ }
4269
4348
 
4270
- // default special tokens
4271
- vocab.special_bos_id = -1;
4272
- vocab.special_eos_id = -1;
4273
- vocab.special_unk_id = 100;
4274
- vocab.special_sep_id = 102;
4275
- vocab.special_pad_id = 0;
4276
- vocab.special_cls_id = 101;
4277
- vocab.special_mask_id = 103;
4278
- vocab.add_space_prefix = false;
4349
+ // for now, only BPE models have pre-tokenizers
4350
+ if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
4351
+ if (tokenizer_pre.empty()) {
4352
+ LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
4353
+ LLAMA_LOG_WARN("%s: \n", __func__);
4354
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4355
+ LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
4356
+ LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
4357
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4358
+ LLAMA_LOG_WARN("%s: \n", __func__);
4359
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4360
+ } else if (
4361
+ tokenizer_pre == "default") {
4362
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4363
+ } else if (
4364
+ tokenizer_pre == "llama3" ||
4365
+ tokenizer_pre == "llama-v3" ||
4366
+ tokenizer_pre == "llama-bpe") {
4367
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
4368
+ } else if (
4369
+ tokenizer_pre == "deepseek-llm") {
4370
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
4371
+ } else if (
4372
+ tokenizer_pre == "deepseek-coder") {
4373
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
4374
+ } else if (
4375
+ tokenizer_pre == "falcon") {
4376
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
4377
+ } else if (
4378
+ tokenizer_pre == "mpt") {
4379
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
4380
+ } else if (
4381
+ tokenizer_pre == "starcoder") {
4382
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
4383
+ } else if (
4384
+ tokenizer_pre == "gpt-2") {
4385
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4386
+ } else {
4387
+ throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4388
+ }
4279
4389
  } else {
4280
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
4281
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4282
-
4283
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
4390
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4284
4391
  }
4285
4392
  }
4286
4393
 
@@ -5975,7 +6082,7 @@ static bool llm_load_tensors(
5975
6082
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
5976
6083
  static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
5977
6084
  try {
5978
- llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
6085
+ llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
5979
6086
 
5980
6087
  model.hparams.vocab_only = params.vocab_only;
5981
6088
 
@@ -6104,37 +6211,47 @@ static struct ggml_tensor * llm_build_inp_embd(
6104
6211
  static void llm_build_kv_store(
6105
6212
  struct ggml_context * ctx,
6106
6213
  const llama_hparams & hparams,
6214
+ const llama_cparams & cparams,
6107
6215
  const llama_kv_cache & kv,
6108
6216
  struct ggml_cgraph * graph,
6109
6217
  struct ggml_tensor * k_cur,
6110
6218
  struct ggml_tensor * v_cur,
6111
- int64_t n_ctx,
6112
6219
  int32_t n_tokens,
6113
6220
  int32_t kv_head,
6114
6221
  const llm_build_cb & cb,
6115
6222
  int64_t il) {
6223
+ const int64_t n_ctx = cparams.n_ctx;
6224
+
6116
6225
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
6117
6226
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
6118
6227
 
6119
6228
  GGML_ASSERT(kv.size == n_ctx);
6120
6229
 
6121
- // compute the transposed [n_tokens, n_embd] V matrix
6122
- assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
6123
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
6124
- cb(v_cur_t, "v_cur_t", il);
6125
-
6126
6230
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
6127
6231
  (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
6128
6232
  cb(k_cache_view, "k_cache_view", il);
6129
6233
 
6130
- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
6131
- ( n_ctx)*ggml_element_size(kv.v_l[il]),
6132
- (kv_head)*ggml_element_size(kv.v_l[il]));
6234
+ // note: storing RoPE-ed version of K in the KV cache
6235
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
6236
+
6237
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
6238
+
6239
+ struct ggml_tensor * v_cache_view = nullptr;
6240
+
6241
+ if (cparams.flash_attn) {
6242
+ v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
6243
+ (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
6244
+ } else {
6245
+ // note: the V cache is transposed when not using flash attention
6246
+ v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
6247
+ ( n_ctx)*ggml_element_size(kv.v_l[il]),
6248
+ (kv_head)*ggml_element_size(kv.v_l[il]));
6249
+
6250
+ v_cur = ggml_transpose(ctx, v_cur);
6251
+ }
6133
6252
  cb(v_cache_view, "v_cache_view", il);
6134
6253
 
6135
- // important: storing RoPE-ed version of K in the KV cache!
6136
- ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
6137
- ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
6254
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
6138
6255
  }
6139
6256
 
6140
6257
  static struct ggml_tensor * llm_build_norm(
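To make the two V-cache layouts in the hunk above concrete, here is a toy example with n_embd_v_gqa = 4, n_ctx = 8, writing n_tokens = 2 starting at kv_head = 3. It prints the element offsets the two views would address (plain arithmetic, no ggml required):

    #include <cstdio>

    int main(void) {
        const int n_embd_v_gqa = 4, n_ctx = 8, n_tokens = 2, kv_head = 3;

        // flash attention: token-major rows, so the write is one contiguous span
        printf("FA    : %d elements at offset %d\n",
               n_tokens*n_embd_v_gqa, kv_head*n_embd_v_gqa); // 8 elements at 12

        // no flash attention: the cache is transposed, each channel holds its
        // n_ctx positions contiguously, so the write is n_embd_v_gqa strided rows
        for (int c = 0; c < n_embd_v_gqa; ++c) {
            printf("no-FA : channel %d -> %d elements at offset %d\n",
                   c, n_tokens, c*n_ctx + kv_head); // stride between channels = n_ctx
        }
        return 0;
    }

The transposed form keeps each channel contiguous for the V*KQ matmul in the unfused path; the fused kernel instead wants V in the same row-major layout as K.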
@@ -6354,11 +6471,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
6354
6471
  return moe_out;
6355
6472
  }
6356
6473
 
6357
- // if max_alibi_bias > 0 then apply ALiBi
6358
6474
  static struct ggml_tensor * llm_build_kqv(
6359
6475
  struct ggml_context * ctx,
6360
6476
  const llama_model & model,
6361
6477
  const llama_hparams & hparams,
6478
+ const llama_cparams & cparams,
6362
6479
  const llama_kv_cache & kv,
6363
6480
  struct ggml_cgraph * graph,
6364
6481
  struct ggml_tensor * wo,
@@ -6366,12 +6483,12 @@ static struct ggml_tensor * llm_build_kqv(
6366
6483
  struct ggml_tensor * q_cur,
6367
6484
  struct ggml_tensor * kq_mask,
6368
6485
  struct ggml_tensor * kq_pos,
6369
- int64_t n_ctx,
6370
6486
  int32_t n_tokens,
6371
6487
  int32_t n_kv,
6372
6488
  float kq_scale,
6373
6489
  const llm_build_cb & cb,
6374
6490
  int il) {
6491
+ const int64_t n_ctx = cparams.n_ctx;
6375
6492
  const int64_t n_head = hparams.n_head;
6376
6493
  const int64_t n_head_kv = hparams.n_head_kv;
6377
6494
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -6389,71 +6506,99 @@ static struct ggml_tensor * llm_build_kqv(
6389
6506
  0);
6390
6507
  cb(k, "k", il);
6391
6508
 
6392
- struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6393
- cb(kq, "kq", il);
6509
+ struct ggml_tensor * cur;
6394
6510
 
6395
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6396
- // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6397
- // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6398
- ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6399
- }
6511
+ if (cparams.flash_attn) {
6512
+ GGML_UNUSED(model);
6513
+ GGML_UNUSED(n_ctx);
6400
6514
 
6401
- if (model.arch == LLM_ARCH_GROK) {
6402
- // need to do the following:
6403
- // multiply by attn_output_multiplyer of 0.08838834764831845
6404
- // and then :
6405
- // kq = 30 * tanh(kq / 30)
6406
- // before the softmax below
6515
+ // note: if this assert triggers, then some check has failed earlier
6516
+ // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
6517
+ GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
6407
6518
 
6408
- //try from phi2
6409
- //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6519
+ // split cached v into n_head heads (not transposed)
6520
+ struct ggml_tensor * v =
6521
+ ggml_view_3d(ctx, kv.v_l[il],
6522
+ n_embd_head_v, n_kv, n_head_kv,
6523
+ ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
6524
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
6525
+ 0);
6526
+ cb(v, "v", il);
6410
6527
 
6411
- kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
6412
- kq = ggml_scale(ctx, kq, 30);
6413
- }
6528
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
6529
+
6530
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6531
+ ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6532
+ }
6533
+
6534
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
6535
+ } else {
6536
+ struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6537
+ cb(kq, "kq", il);
6538
+
6539
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6540
+ // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6541
+ // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6542
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6543
+ }
6544
+
6545
+ if (model.arch == LLM_ARCH_GROK) {
6546
+ // need to do the following:
6547
+ // multiply by attn_output_multiplier of 0.08838834764831845
6548
+ // and then:
6549
+ // kq = 30 * tanh(kq / 30)
6550
+ // before the softmax below
6551
+
6552
+ //try from phi2
6553
+ //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6554
+
6555
+ kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
6556
+ kq = ggml_scale(ctx, kq, 30);
6557
+ }
6414
6558
 
6415
6559
  #if defined(GGML_USE_KOMPUTE)
6416
6560
  #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
6417
6561
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
6418
6562
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
6419
- if (hparams.f_max_alibi_bias > 0.0f) {
6420
- kq = ggml_scale(ctx, kq, kq_scale);
6421
- cb(kq, "kq_scaled", il);
6563
+ if (hparams.use_alibi) {
6564
+ kq = ggml_scale(ctx, kq, kq_scale);
6565
+ cb(kq, "kq_scaled", il);
6422
6566
 
6423
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6424
- cb(kq, "kq_scaled_alibi", il);
6567
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6568
+ cb(kq, "kq_scaled_alibi", il);
6425
6569
 
6426
- kq = ggml_add(ctx, kq, kq_mask);
6427
- cb(kq, "kq_masked", il);
6570
+ kq = ggml_add(ctx, kq, kq_mask);
6571
+ cb(kq, "kq_masked", il);
6428
6572
 
6429
- kq = ggml_soft_max(ctx, kq);
6430
- cb(kq, "kq_soft_max", il);
6431
- } else
6573
+ kq = ggml_soft_max(ctx, kq);
6574
+ cb(kq, "kq_soft_max", il);
6575
+ } else
6432
6576
  #endif
6433
- {
6434
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6435
- cb(kq, "kq_soft_max_ext", il);
6436
- }
6577
+ {
6578
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6579
+ cb(kq, "kq_soft_max_ext", il);
6580
+ }
6437
6581
 
6438
- GGML_ASSERT(kv.size == n_ctx);
6582
+ GGML_ASSERT(kv.size == n_ctx);
6439
6583
 
6440
- // split cached v into n_head heads
6441
- struct ggml_tensor * v =
6442
- ggml_view_3d(ctx, kv.v_l[il],
6443
- n_kv, n_embd_head_v, n_head_kv,
6444
- ggml_element_size(kv.v_l[il])*n_ctx,
6445
- ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
6446
- 0);
6447
- cb(v, "v", il);
6584
+ // split cached v into n_head heads
6585
+ struct ggml_tensor * v =
6586
+ ggml_view_3d(ctx, kv.v_l[il],
6587
+ n_kv, n_embd_head_v, n_head_kv,
6588
+ ggml_element_size(kv.v_l[il])*n_ctx,
6589
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
6590
+ 0);
6591
+ cb(v, "v", il);
6448
6592
 
6449
- struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
6450
- cb(kqv, "kqv", il);
6593
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
6594
+ cb(kqv, "kqv", il);
6451
6595
 
6452
- struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6453
- cb(kqv_merged, "kqv_merged", il);
6596
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6597
+ cb(kqv_merged, "kqv_merged", il);
6454
6598
 
6455
- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6456
- cb(cur, "kqv_merged_cont", il);
6599
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6600
+ cb(cur, "kqv_merged_cont", il);
6601
+ }
6457
6602
 
6458
6603
  ggml_build_forward_expand(graph, cur);
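Both branches above compute the same quantity; the fused path simply collapses it into one graph node. Per head, in either case:

    cur = softmax(Q K^T * kq_scale + kq_mask) * V

(plus the ALiBi bias when kq_pos is set in the unfused branch). The unfused branch materializes the intermediate KQ matrix and then permutes/conts to merge heads, while ggml_flash_attn_ext streams over K and V without ever storing KQ, which is why it needs the untransposed V views and the F16 mask prepared elsewhere in this diff.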
6459
6604
 
@@ -6473,6 +6618,7 @@ static struct ggml_tensor * llm_build_kv(
6473
6618
  struct ggml_context * ctx,
6474
6619
  const llama_model & model,
6475
6620
  const llama_hparams & hparams,
6621
+ const llama_cparams & cparams,
6476
6622
  const llama_kv_cache & kv,
6477
6623
  struct ggml_cgraph * graph,
6478
6624
  struct ggml_tensor * wo,
@@ -6482,7 +6628,6 @@ static struct ggml_tensor * llm_build_kv(
6482
6628
  struct ggml_tensor * q_cur,
6483
6629
  struct ggml_tensor * kq_mask,
6484
6630
  struct ggml_tensor * kq_pos,
6485
- int64_t n_ctx,
6486
6631
  int32_t n_tokens,
6487
6632
  int32_t kv_head,
6488
6633
  int32_t n_kv,
@@ -6496,12 +6641,12 @@ static struct ggml_tensor * llm_build_kv(
6496
6641
  ggml_build_forward_expand(graph, k_cur);
6497
6642
  ggml_build_forward_expand(graph, v_cur);
6498
6643
 
6499
- llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
6644
+ llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
6500
6645
 
6501
6646
  struct ggml_tensor * cur;
6502
6647
 
6503
- cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
6504
- q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
6648
+ cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
6649
+ q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
6505
6650
  cb(cur, "kqv_out", il);
6506
6651
 
6507
6652
  return cur;
@@ -6543,6 +6688,8 @@ struct llm_build_context {
6543
6688
  const int32_t kv_head; // index of where we store new KV data in the cache
6544
6689
  const int32_t n_orig_ctx;
6545
6690
 
6691
+ const bool flash_attn;
6692
+
6546
6693
  const enum llama_pooling_type pooling_type;
6547
6694
  const enum llama_rope_type rope_type;
6548
6695
 
@@ -6589,6 +6736,7 @@ struct llm_build_context {
6589
6736
  n_outputs (worst_case ? n_tokens : lctx.n_outputs),
6590
6737
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
6591
6738
  n_orig_ctx (cparams.n_yarn_orig_ctx),
6739
+ flash_attn (cparams.flash_attn),
6592
6740
  pooling_type (cparams.pooling_type),
6593
6741
  rope_type (hparams.rope_type),
6594
6742
  cb (cb),
@@ -6703,15 +6851,31 @@ struct llm_build_context {
6703
6851
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
6704
6852
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
6705
6853
 
6706
- ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6707
- nm, n_embd_v_gqa,
6708
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6709
- ggml_row_size(kv_self.v_l[il]->type, i));
6854
+ ggml_tensor * view_v_src;
6855
+ ggml_tensor * view_v_dst;
6710
6856
 
6711
- ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6712
- nm, n_embd_v_gqa,
6713
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6714
- ggml_row_size(kv_self.v_l[il]->type, id));
6857
+ if (flash_attn) {
6858
+ // NOTE: the V cache is not transposed when using flash attention
6859
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6860
+ n_embd_v_gqa, nm,
6861
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
6862
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
6863
+
6864
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6865
+ n_embd_v_gqa, nm,
6866
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
6867
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
6868
+ } else {
6869
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6870
+ nm, n_embd_v_gqa,
6871
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6872
+ ggml_row_size(kv_self.v_l[il]->type, i));
6873
+
6874
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6875
+ nm, n_embd_v_gqa,
6876
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6877
+ ggml_row_size(kv_self.v_l[il]->type, id));
6878
+ }
6715
6879
 
6716
6880
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
6717
6881
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
@@ -6741,20 +6905,26 @@ struct llm_build_context {
6741
6905
 
6742
6906
  struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
6743
6907
  if (causal) {
6744
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
6908
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
6745
6909
  } else {
6746
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
6910
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
6747
6911
  }
6748
6912
  cb(lctx.inp_KQ_mask, "KQ_mask", -1);
6749
6913
  ggml_set_input(lctx.inp_KQ_mask);
6750
- return lctx.inp_KQ_mask;
6914
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
6751
6915
  }
6752
6916
 
6753
- struct ggml_tensor * build_inp_KQ_pos() {
6754
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6917
+ struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
6918
+ if (causal) {
6919
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6920
+ } else {
6921
+ // TODO: this will be needed for ALiBi-based BERT models
6922
+ // https://github.com/ggerganov/llama.cpp/pull/6826
6923
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
6924
+ }
6755
6925
  cb(lctx.inp_KQ_pos, "KQ_pos", -1);
6756
6926
  ggml_set_input(lctx.inp_KQ_pos);
6757
- return lctx.inp_KQ_pos;
6927
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
6758
6928
  }
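When flash attention is on, both inputs are cast to F16, and the mask is now allocated with GGML_PAD so its row count matches the granularity the fused kernel expects. GGML_PAD rounds up to a multiple of a power of two; two worked values, assuming GGML_KQ_MASK_PAD is 32 per its definition elsewhere in this file:

    #include "ggml.h"

    // GGML_PAD(x, n) == ((x + n - 1) & ~(n - 1))
    static_assert(GGML_PAD(  1, 32) ==  32, "a single-token decode still gets one full pad unit");
    static_assert(GGML_PAD(100, 32) == 128, "larger batches round up to the next multiple");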
6759
6929
 
6760
6930
  struct ggml_tensor * build_inp_mean() {
@@ -6860,9 +7030,9 @@ struct llm_build_context {
6860
7030
  );
6861
7031
  cb(Kcur, "Kcur", il);
6862
7032
 
6863
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7033
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
6864
7034
  model.layers[il].wo, model.layers[il].bo,
6865
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7035
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6866
7036
  }
6867
7037
 
6868
7038
  if (il == n_layer - 1) {
@@ -7000,9 +7170,9 @@ struct llm_build_context {
7000
7170
  cb(Qcur, "Qcur", il);
7001
7171
  cb(Kcur, "Kcur", il);
7002
7172
 
7003
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7173
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7004
7174
  model.layers[il].wo, NULL,
7005
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7175
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7006
7176
  }
7007
7177
 
7008
7178
  if (il == n_layer - 1) {
@@ -7107,9 +7277,9 @@ struct llm_build_context {
7107
7277
  ext_factor, attn_factor, beta_fast, beta_slow
7108
7278
  );
7109
7279
  cb(Kcur, "Kcur", il);
7110
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7280
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7111
7281
  model.layers[il].wo, NULL,
7112
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7282
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7113
7283
  }
7114
7284
 
7115
7285
  if (il == n_layer - 1) {
@@ -7227,9 +7397,9 @@ struct llm_build_context {
7227
7397
  );
7228
7398
  cb(Kcur, "Kcur", il);
7229
7399
 
7230
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7400
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7231
7401
  model.layers[il].wo, NULL,
7232
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7402
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7233
7403
  }
7234
7404
 
7235
7405
  if (il == n_layer - 1) {
@@ -7352,9 +7522,9 @@ struct llm_build_context {
7352
7522
  );
7353
7523
  cb(Kcur, "Kcur", il);
7354
7524
 
7355
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7525
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7356
7526
  model.layers[il].wo, model.layers[il].bo,
7357
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7527
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7358
7528
  }
7359
7529
 
7360
7530
  if (il == n_layer - 1) {
@@ -7504,9 +7674,9 @@ struct llm_build_context {
7504
7674
  );
7505
7675
  cb(Kcur, "Kcur", il);
7506
7676
 
7507
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7508
- model.layers[il].wo, NULL,
7509
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7677
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7678
+ model.layers[il].wo, NULL,
7679
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7510
7680
  }
7511
7681
 
7512
7682
  if (il == n_layer - 1) {
@@ -7616,9 +7786,9 @@ struct llm_build_context {
7616
7786
 
7617
7787
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7618
7788
 
7619
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7789
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7620
7790
  model.layers[il].wo, model.layers[il].bo,
7621
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7791
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7622
7792
  }
7623
7793
 
7624
7794
  if (il == n_layer - 1) {
@@ -7820,9 +7990,9 @@ struct llm_build_context {
7820
7990
  );
7821
7991
  cb(Vcur, "Vcur", il);
7822
7992
 
7823
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7993
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7824
7994
  model.layers[il].wo, model.layers[il].bo,
7825
- Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7995
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7826
7996
  }
7827
7997
 
7828
7998
  if (il == n_layer - 1) {
@@ -7916,9 +8086,9 @@ struct llm_build_context {
7916
8086
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7917
8087
  cb(Qcur, "Qcur", il);
7918
8088
 
7919
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8089
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7920
8090
  model.layers[il].wo, NULL,
7921
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8091
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7922
8092
  }
7923
8093
 
7924
8094
  if (il == n_layer - 1) {
@@ -8209,9 +8379,9 @@ struct llm_build_context {
8209
8379
 
8210
8380
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8211
8381
 
8212
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8382
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8213
8383
  model.layers[il].wo, model.layers[il].bo,
8214
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8384
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8215
8385
  }
8216
8386
 
8217
8387
  if (il == n_layer - 1) {
@@ -8340,14 +8510,15 @@ struct llm_build_context {
8340
8510
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8341
8511
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8342
8512
 
8343
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8344
- model.layers[il].wo, model.layers[il].bo,
8345
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8513
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8514
+ model.layers[il].wo, model.layers[il].bo,
8515
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8346
8516
  } else {
8347
8517
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8348
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8518
+
8519
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8349
8520
  model.layers[il].wo, model.layers[il].bo,
8350
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8521
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8351
8522
  }
8352
8523
  }
8353
8524
 
@@ -8489,9 +8660,9 @@ struct llm_build_context {
8489
8660
  );
8490
8661
  cb(Kcur, "Kcur", il);
8491
8662
 
8492
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8663
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8493
8664
  model.layers[il].wo, NULL,
8494
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8665
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8495
8666
  }
8496
8667
 
8497
8668
  if (il == n_layer - 1) {
@@ -8607,9 +8778,9 @@ struct llm_build_context {
8607
8778
  );
8608
8779
  cb(Kcur, "Kcur", il);
8609
8780
 
8610
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8781
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8611
8782
  model.layers[il].wo, NULL,
8612
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8783
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8613
8784
  }
8614
8785
 
8615
8786
  if (il == n_layer - 1) {
@@ -8720,9 +8891,9 @@ struct llm_build_context {
8720
8891
  );
8721
8892
  cb(Kcur, "Kcur", il);
8722
8893
 
8723
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8894
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8724
8895
  model.layers[il].wo, model.layers[il].bo,
8725
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8896
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8726
8897
  }
8727
8898
 
8728
8899
  if (il == n_layer - 1) {
@@ -8834,9 +9005,9 @@ struct llm_build_context {
8834
9005
  );
8835
9006
  cb(Kcur, "Kcur", il);
8836
9007
 
8837
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9008
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8838
9009
  model.layers[il].wo, model.layers[il].bo,
8839
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9010
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8840
9011
  }
8841
9012
 
8842
9013
  if (il == n_layer - 1) {
@@ -8989,9 +9160,9 @@ struct llm_build_context {
8989
9160
  );
8990
9161
  cb(Kcur, "Kcur", il);
8991
9162
 
8992
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9163
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8993
9164
  model.layers[il].wo, model.layers[il].bo,
8994
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9165
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8995
9166
  }
8996
9167
 
8997
9168
  if (il == n_layer - 1) {
@@ -9106,9 +9277,9 @@ struct llm_build_context {
9106
9277
  );
9107
9278
  cb(Kcur, "Kcur", il);
9108
9279
 
9109
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9110
- model.layers[il].wo, NULL,
9111
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9280
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9281
+ model.layers[il].wo, model.layers[il].bo,
9282
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9112
9283
  }
9113
9284
 
9114
9285
  if (il == n_layer - 1) {
@@ -9219,9 +9390,9 @@ struct llm_build_context {
9219
9390
  ext_factor, attn_factor, beta_fast, beta_slow);
9220
9391
  cb(Kcur, "Kcur", il);
9221
9392
 
9222
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9393
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9223
9394
  model.layers[il].wo, NULL,
9224
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9395
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9225
9396
  }
9226
9397
  struct ggml_tensor * sa_out = cur;
9227
9398
 
@@ -9322,9 +9493,9 @@ struct llm_build_context {
9322
9493
 
9323
9494
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9324
9495
 
9325
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9496
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9326
9497
  model.layers[il].wo, model.layers[il].bo,
9327
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9498
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9328
9499
  }
9329
9500
 
9330
9501
  if (il == n_layer - 1) {
@@ -9429,9 +9600,9 @@ struct llm_build_context {
9429
9600
  );
9430
9601
  cb(Kcur, "Kcur", il);
9431
9602
 
9432
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9603
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9433
9604
  model.layers[il].wo, model.layers[il].bo,
9434
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9605
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9435
9606
  }
9436
9607
 
9437
9608
  if (il == n_layer - 1) {
@@ -9545,9 +9716,9 @@ struct llm_build_context {
9545
9716
  );
9546
9717
  cb(Kcur, "Kcur", il);
9547
9718
 
9548
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9719
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9549
9720
  model.layers[il].wo, NULL,
9550
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9721
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9551
9722
  }
9552
9723
 
9553
9724
  if (il == n_layer - 1) {
@@ -9662,9 +9833,9 @@ struct llm_build_context {
9662
9833
  );
9663
9834
  cb(Kcur, "Kcur", il);
9664
9835
 
9665
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9836
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9666
9837
  model.layers[il].wo, model.layers[il].bo,
9667
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9838
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9668
9839
  }
9669
9840
 
9670
9841
  if (il == n_layer - 1) {
@@ -9792,9 +9963,9 @@ struct llm_build_context {
9792
9963
  );
9793
9964
  cb(Kcur, "Kcur", il);
9794
9965
 
9795
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9966
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9796
9967
  model.layers[il].wo, model.layers[il].bo,
9797
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9968
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9798
9969
  }
9799
9970
 
9800
9971
  if (il == n_layer - 1) {
@@ -9913,9 +10084,9 @@ struct llm_build_context {
9913
10084
  ext_factor, attn_factor, beta_fast, beta_slow);
9914
10085
  cb(Kcur, "Kcur", il);
9915
10086
 
9916
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10087
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9917
10088
  model.layers[il].wo, NULL,
9918
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10089
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9919
10090
  }
9920
10091
 
9921
10092
  if (il == n_layer - 1) {
@@ -10032,9 +10203,9 @@ struct llm_build_context {
10032
10203
  );
10033
10204
  cb(Kcur, "Kcur", il);
10034
10205
 
10035
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10206
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10036
10207
  model.layers[il].wo, model.layers[il].bo,
10037
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10208
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10038
10209
  }
10039
10210
 
10040
10211
  if (il == n_layer - 1) {
@@ -10322,9 +10493,9 @@ struct llm_build_context {
10322
10493
  );
10323
10494
  cb(Kcur, "Kcur", il);
10324
10495
 
10325
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10496
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10326
10497
  model.layers[il].wo, model.layers[il].bo,
10327
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10498
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10328
10499
  }
10329
10500
 
10330
10501
  if (il == n_layer - 1) {
@@ -10453,9 +10624,9 @@ struct llm_build_context {
10453
10624
  );
10454
10625
  cb(Kcur, "Kcur", il);
10455
10626
 
10456
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10627
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10457
10628
  model.layers[il].wo, nullptr,
10458
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10629
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10459
10630
  }
10460
10631
 
10461
10632
  if (il == n_layer - 1) {
@@ -10882,7 +11053,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
10882
11053
  }
10883
11054
  }
10884
11055
 
10885
- if (hparams.need_kq_pos) {
11056
+ // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
11057
+ // this allows processing multiple sequences in parallel with ALiBi-based models
11058
+ if (hparams.use_alibi) {
10886
11059
  const int64_t n_kv = kv_self.n;
10887
11060
 
10888
11061
  GGML_ASSERT(lctx.inp_KQ_pos);
@@ -11264,7 +11437,7 @@ static int llama_decode_internal(
11264
11437
  // a heuristic, to avoid attending the full cache if it is not yet utilized
11265
11438
  // after enough generations, the benefit from this heuristic disappears
11266
11439
  // if we start defragmenting the cache, the benefit from this will be more important
11267
- kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
11440
+ kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
11268
11441
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
11269
11442
  }
11270
11443
  }
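The attended cache window is now rounded up to a multiple of 256 instead of 32, with a matching 256 floor, presumably to suit the granularity of the new flash-attention kernels (the diff states no rationale, so treat that as an assumption). Short prompts pay for a somewhat larger attended range:

    // hypothetical: 300 used KV cells, kv_self.size = 4096
    // old: std::max(32u,  GGML_PAD(300, 32))  -> 320 attended entries
    // new: std::max(256u, GGML_PAD(300, 256)) -> 512 attended entries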
@@ -11432,6 +11605,10 @@ static int llama_decode_internal(
11432
11605
  }
11433
11606
  }
11434
11607
 
11608
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
11609
+ // overlap with device computation.
11610
+ ggml_backend_sched_reset(lctx.sched);
11611
+
11435
11612
  return 0;
11436
11613
  }
11437
11614
 
@@ -11457,7 +11634,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
11457
11634
  // each move requires 6*n_layer tensors (see build_defrag)
11458
11635
  // - source view, destination view, copy operation
11459
11636
  // - x2 for keys and values
11460
- const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
11637
+ //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
11638
+ // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
11639
+ const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
11461
11640
 
11462
11641
  // determine which KV cells to move where
11463
11642
  //
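Worked numbers for the new cap, taking LLAMA_MAX_NODES = 8192 and a hypothetical n_layer = 80: each move costs 6*80 = 480 nodes. The old formula allowed 8192/480 = 17 moves (8160 nodes), leaving only 32 nodes of headroom for the rest of the graph; the new one allows (8192 - 160)/480 = 16 moves (7680 nodes), reserving at least 2*n_layer nodes of slack.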
@@ -11781,7 +11960,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
11781
11960
  }
11782
11961
  case LLAMA_VOCAB_TYPE_BPE: {
11783
11962
  GGML_ASSERT(false);
11784
- return unicode_utf8_to_byte(token_data.text);
11963
+ return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
11785
11964
  }
11786
11965
  case LLAMA_VOCAB_TYPE_WPM: {
11787
11966
  GGML_ASSERT(false);
@@ -12003,7 +12182,79 @@ struct llm_tokenizer_bpe {
12003
12182
 
12004
12183
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12005
12184
  int final_prev_index = -1;
12006
- auto word_collection = bpe_gpt2_preprocess(text);
12185
+
12186
+ std::vector<std::string> word_collection;
12187
+ switch (vocab.type) {
12188
+ case LLAMA_VOCAB_TYPE_BPE:
12189
+ switch (vocab.type_pre) {
12190
+ case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12191
+ word_collection = unicode_regex_split(text, {
12192
+ // original regex from tokenizer.json
12193
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12194
+
12195
+ // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
12196
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12197
+ });
12198
+ break;
12199
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12200
+ word_collection = unicode_regex_split(text, {
12201
+ "[\r\n]",
12202
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
12203
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
12204
+ "\\s+$",
12205
+ "[一-龥ࠀ-一가-퟿]+",
12206
+ "\\p{N}+",
12207
+ });
12208
+ break;
12209
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
12210
+ word_collection = unicode_regex_split(text, {
12211
+ "[\r\n]",
12212
+ "\\s?\\p{L}+",
12213
+ "\\s?\\p{P}+",
12214
+ "[一-龥ࠀ-一가-퟿]+",
12215
+ "\\p{N}+",
12216
+ });
12217
+ break;
12218
+ case LLAMA_VOCAB_PRE_TYPE_FALCON:
12219
+ word_collection = unicode_regex_split(text, {
12220
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12221
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12222
+ "\\p{N}+",
12223
+ "[0-9][0-9][0-9]",
12224
+ });
12225
+ break;
12226
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
12227
+ // TODO: MPT pre-tokenization regexes are unknown
12228
+ // the following are close, but not exact. run the following:
12229
+ // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
12230
+ GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
12231
+ word_collection = unicode_regex_split(text, {
12232
+ "\\s?\\p{L}+",
12233
+ "\\s?\\p{P}+",
12234
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12235
+ });
12236
+ break;
12237
+ case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12238
+ case LLAMA_VOCAB_PRE_TYPE_GPT2:
12239
+ word_collection = unicode_regex_split(text, {
12240
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12241
+ });
12242
+ break;
12243
+ default:
12244
+ // default regex for BPE tokenization pre-processing
12245
+ word_collection = unicode_regex_split(text, {
12246
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12247
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12248
+ "\\p{N}+",
12249
+ "[0-9][0-9][0-9]",
12250
+ });
12251
+ break;
12252
+ }
12253
+ break;
12254
+ default:
12255
+ GGML_ASSERT(false);
12256
+ break;
12257
+ }
12007
12258
 
12008
12259
  symbols_final.clear();
12009
12260
 
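
The switch above replaces the hand-written GPT-2 state machine (removed in a later hunk) with per-model regex sets handed to unicode_regex_split, keyed on the new tokenizer.ggml.pre metadata. Note that the MPT branch's GGML_ASSERT on a string literal is always true, so it documents intent rather than aborting. Since std::regex cannot express \p{L}/\p{N}, llama.cpp ships its own splitter; the ASCII-only toy below, a sketch with a deliberately simplified pattern, only illustrates the idea of pre-splitting text into words before the BPE merges run:

    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    // ASCII stand-in for the GPT-2 pattern: contractions, letter runs,
    // digit runs, punctuation runs, and leftover whitespace
    static std::vector<std::string> pre_tokenize(const std::string & text) {
        static const std::regex re(
            "'s|'t|'re|'ve|'m|'ll|'d| ?[A-Za-z]+| ?[0-9]+| ?[^ A-Za-z0-9]+| +");
        std::vector<std::string> words;
        for (std::sregex_iterator it(text.begin(), text.end(), re), end; it != end; ++it) {
            words.push_back(it->str());
        }
        return words;
    }

    int main() {
        for (const auto & w : pre_tokenize("I'll have 3 apples, please")) {
            std::cout << '[' << w << "]\n";
        }
        return 0;
    }
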
@@ -12130,145 +12381,6 @@ private:
12130
12381
  work_queue.push(bigram);
12131
12382
  }
12132
12383
 
12133
- std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
12134
- std::vector<std::string> bpe_words;
12135
- std::vector<std::string> bpe_encoded_words;
12136
-
12137
- std::string token = "";
12138
- // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
12139
- bool collecting_numeric = false;
12140
- bool collecting_letter = false;
12141
- bool collecting_special = false;
12142
- bool collecting_whitespace_lookahead = false;
12143
- bool collecting = false;
12144
-
12145
- std::vector<std::string> text_utf;
12146
- text_utf.reserve(text.size());
12147
- bpe_words.reserve(text.size());
12148
- bpe_encoded_words.reserve(text.size());
12149
-
12150
- const auto cpts = unicode_cpts_from_utf8(text);
12151
- for (size_t i = 0; i < cpts.size(); ++i)
12152
- text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
12153
-
12154
- for (int i = 0; i < (int)text_utf.size(); i++) {
12155
- const std::string & utf_char = text_utf[i];
12156
- bool split_condition = false;
12157
- int bytes_remain = text_utf.size() - i;
12158
- // forward backward lookups
12159
- const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
12160
- const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
12161
-
12162
- // handling contractions
12163
- if (!split_condition && bytes_remain >= 2) {
12164
- // 's|'t|'m|'d
12165
- if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
12166
- split_condition = true;
12167
- }
12168
- if (split_condition) {
12169
- if (token.size()) {
12170
- bpe_words.emplace_back(token); // push previous content as token
12171
- }
12172
- token = utf_char + utf_char_next;
12173
- bpe_words.emplace_back(token);
12174
- token = "";
12175
- i++;
12176
- continue;
12177
- }
12178
- }
12179
- if (!split_condition && bytes_remain >= 3) {
12180
- // 're|'ve|'ll
12181
- if (utf_char == "\'" && (
12182
- (utf_char_next == "r" && utf_char_next_next == "e") ||
12183
- (utf_char_next == "v" && utf_char_next_next == "e") ||
12184
- (utf_char_next == "l" && utf_char_next_next == "l"))
12185
- ) {
12186
- split_condition = true;
12187
- }
12188
- if (split_condition) {
12189
- // current token + next token can be defined
12190
- if (token.size()) {
12191
- bpe_words.emplace_back(token); // push previous content as token
12192
- }
12193
- token = utf_char + utf_char_next + utf_char_next_next;
12194
- bpe_words.emplace_back(token); // the contraction
12195
- token = "";
12196
- i += 2;
12197
- continue;
12198
- }
12199
- }
12200
-
12201
- if (!split_condition && !collecting) {
12202
- if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
12203
- collecting_letter = true;
12204
- collecting = true;
12205
- }
12206
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
12207
- collecting_numeric = true;
12208
- collecting = true;
12209
- }
12210
- else if (
12211
- ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
12212
- (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
12213
- ) {
12214
- collecting_special = true;
12215
- collecting = true;
12216
- }
12217
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
12218
- collecting_whitespace_lookahead = true;
12219
- collecting = true;
12220
- }
12221
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
12222
- split_condition = true;
12223
- }
12224
- }
12225
- else if (!split_condition && collecting) {
12226
- if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
12227
- split_condition = true;
12228
- }
12229
- else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
12230
- split_condition = true;
12231
- }
12232
- else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
12233
- split_condition = true;
12234
- }
12235
- else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
12236
- split_condition = true;
12237
- }
12238
- }
12239
-
12240
- if (utf_char_next == "") {
12241
- split_condition = true; // final
12242
- token += utf_char;
12243
- }
12244
-
12245
- if (split_condition) {
12246
- if (token.size()) {
12247
- bpe_words.emplace_back(token);
12248
- }
12249
- token = utf_char;
12250
- collecting = false;
12251
- collecting_letter = false;
12252
- collecting_numeric = false;
12253
- collecting_special = false;
12254
- collecting_whitespace_lookahead = false;
12255
- }
12256
- else {
12257
- token += utf_char;
12258
- }
12259
- }
12260
-
12261
- for (std::string & word : bpe_words) {
12262
- std::string encoded_token = "";
12263
- for (char & c : word) {
12264
- encoded_token += unicode_byte_to_utf8(c);
12265
- }
12266
- bpe_encoded_words.emplace_back(encoded_token);
12267
- }
12268
-
12269
- return bpe_encoded_words;
12270
- }
12271
-
12272
12384
  const llama_vocab & vocab;
12273
12385
 
12274
12386
  std::vector<llm_symbol> symbols;
@@ -12588,7 +12700,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12588
12700
  } break;
12589
12701
  case LLAMA_VOCAB_TYPE_BPE:
12590
12702
  {
12591
- if (add_special && vocab.special_add_bos == 1) {
12703
+ if (add_special && vocab.special_add_bos != 0) {
12592
12704
  GGML_ASSERT(vocab.special_bos_id != -1);
12593
12705
  output.push_back(vocab.special_bos_id);
12594
12706
  }
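
The "== 1" to "!= 0" change matters because special_add_bos is read from GGUF metadata as a tri-state (assumed here to follow the convention used elsewhere in llama.cpp: -1 unknown, 0 off, 1 on), so an absent flag should fall back to adding BOS instead of silently skipping it. A sketch of the behavioural difference:

    #include <cstdio>

    int main() {
        // assumed tri-state: -1 = not present in metadata, 0 = off, 1 = on
        for (int special_add_bos : {-1, 0, 1}) {
            const bool add_old = (special_add_bos == 1); // previous check
            const bool add_new = (special_add_bos != 0); // this release
            std::printf("flag=%2d  old: %d  new: %d\n", special_add_bos, add_old, add_new);
        }
        return 0;
    }
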
@@ -14360,14 +14472,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
14360
14472
  }
14361
14473
 
14362
14474
  static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
14363
- std::mutex mutex;
14364
- int64_t counter = 0;
14365
- size_t new_size = 0;
14366
14475
  if (nthread < 2) {
14367
14476
  // single-thread
14368
- return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14477
+ size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14478
+ if (!ggml_validate_row_data(new_type, new_data, new_size)) {
14479
+ throw std::runtime_error("quantized data validation failed");
14480
+ }
14481
+ return new_size;
14369
14482
  }
14370
- auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
14483
+
14484
+ std::mutex mutex;
14485
+ int64_t counter = 0;
14486
+ size_t new_size = 0;
14487
+ bool valid = true;
14488
+ auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
14371
14489
  nrows, n_per_row, imatrix]() {
14372
14490
  const int64_t nrows_per_chunk = chunk_size / n_per_row;
14373
14491
  size_t local_size = 0;
@@ -14382,7 +14500,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14382
14500
  }
14383
14501
  lock.unlock();
14384
14502
  const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
14385
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14503
+ size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14504
+ local_size += this_size;
14505
+
14506
+ // validate the quantized data
14507
+ const size_t row_size = ggml_row_size(new_type, n_per_row);
14508
+ void * this_data = (char *) new_data + first_row * row_size;
14509
+ if (!ggml_validate_row_data(new_type, this_data, this_size)) {
14510
+ std::unique_lock<std::mutex> lock(mutex);
14511
+ valid = false;
14512
+ break;
14513
+ }
14386
14514
  }
14387
14515
  };
14388
14516
  for (int it = 0; it < nthread - 1; ++it) {
@@ -14391,6 +14519,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14391
14519
  compute();
14392
14520
  for (auto & w : workers) { w.join(); }
14393
14521
  workers.clear();
14522
+ if (!valid) {
14523
+ throw std::runtime_error("quantized data validation failed");
14524
+ }
14394
14525
  return new_size;
14395
14526
  }
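
The refactor above keeps the existing chunk-dispatch pattern (a mutex-guarded counter hands out row ranges) and adds a shared valid flag that any worker can clear, with the failure surfaced as an exception only after all threads join. A standalone sketch of that pattern, with llama.cpp's quantize/validate calls replaced by a stub:

    #include <algorithm>
    #include <cstdint>
    #include <mutex>
    #include <stdexcept>
    #include <thread>
    #include <vector>

    // stand-in for ggml_quantize_chunk + ggml_validate_row_data
    static bool process_chunk(int64_t first_row, int64_t n_rows) {
        (void) first_row; (void) n_rows;
        return true;
    }

    int main() {
        const int64_t nrows          = 1000;
        const int64_t rows_per_chunk = 64;
        const int     nthread        = 4;

        std::mutex mutex;
        int64_t    counter = 0;
        bool       valid   = true;

        auto compute = [&]() {
            while (true) {
                int64_t first_row;
                {
                    std::lock_guard<std::mutex> lock(mutex);
                    if (!valid || counter >= nrows) {
                        return;
                    }
                    first_row = counter;
                    counter  += rows_per_chunk;
                }
                const int64_t this_nrow = std::min(nrows - first_row, rows_per_chunk);
                if (!process_chunk(first_row, this_nrow)) {
                    std::lock_guard<std::mutex> lock(mutex);
                    valid = false; // any worker can fail the whole job
                    return;
                }
            }
        };

        std::vector<std::thread> workers;
        for (int it = 0; it < nthread - 1; ++it) {
            workers.emplace_back(compute);
        }
        compute(); // the calling thread participates, as in llama.cpp
        for (auto & w : workers) {
            w.join();
        }

        if (!valid) {
            throw std::runtime_error("quantized data validation failed");
        }
        return 0;
    }
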
14396
14527
 
@@ -14453,7 +14584,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14453
14584
  auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
14454
14585
  kv_overrides = v->data();
14455
14586
  }
14456
- llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
14587
+ llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
14457
14588
  ml.init_mappings(false); // no prefetching
14458
14589
 
14459
14590
  llama_model model;
@@ -14491,11 +14622,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14491
14622
  for (auto & o : overrides) {
14492
14623
  if (o.key[0] == 0) break;
14493
14624
  if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
14494
- gguf_set_val_f32(ctx_out, o.key, o.float_value);
14625
+ gguf_set_val_f32(ctx_out, o.key, o.val_f64);
14495
14626
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
14496
- gguf_set_val_i32(ctx_out, o.key, o.int_value);
14627
+ gguf_set_val_i32(ctx_out, o.key, o.val_i64);
14497
14628
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
14498
- gguf_set_val_bool(ctx_out, o.key, o.bool_value);
14629
+ gguf_set_val_bool(ctx_out, o.key, o.val_bool);
14630
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
14631
+ gguf_set_val_str(ctx_out, o.key, o.val_str);
14499
14632
  } else {
14500
14633
  LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
14501
14634
  }
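
This hunk tracks the renamed override fields (float_value/int_value/bool_value became val_f64/val_i64/val_bool) and the new string-valued override type. A sketch of the widened layout implied by the rename; field sizes here are illustrative, see llama.h of this release for the authoritative struct:

    #include <cstdint>
    #include <cstdio>

    enum kv_override_type { KV_TYPE_INT, KV_TYPE_FLOAT, KV_TYPE_BOOL, KV_TYPE_STR };

    struct kv_override {
        char key[128];
        enum kv_override_type tag;
        union {
            int64_t val_i64;
            double  val_f64;
            bool    val_bool;
            char    val_str[128]; // new: string-valued overrides
        };
    };

    int main() {
        kv_override o = {};
        std::snprintf(o.key, sizeof(o.key), "tokenizer.ggml.pre");
        o.tag = KV_TYPE_STR;
        std::snprintf(o.val_str, sizeof(o.val_str), "llama3");
        std::printf("%s = %s\n", o.key, o.val_str);
        return 0;
    }
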
@@ -14814,7 +14947,7 @@ static int llama_apply_lora_from_file_internal(
14814
14947
  std::unique_ptr<llama_model_loader> ml;
14815
14948
  if (path_base_model) {
14816
14949
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
14817
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
14950
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
14818
14951
  ml->init_mappings(/*prefetch*/ false); // no prefetching
14819
14952
  }
14820
14953
 
@@ -15073,6 +15206,7 @@ struct llama_model_params llama_model_default_params() {
15073
15206
  /*.vocab_only =*/ false,
15074
15207
  /*.use_mmap =*/ true,
15075
15208
  /*.use_mlock =*/ false,
15209
+ /*.check_tensors =*/ false,
15076
15210
  };
15077
15211
 
15078
15212
  #ifdef GGML_USE_METAL
@@ -15109,6 +15243,7 @@ struct llama_context_params llama_context_default_params() {
15109
15243
  /*.logits_all =*/ false,
15110
15244
  /*.embeddings =*/ false,
15111
15245
  /*.offload_kqv =*/ true,
15246
+ /*.flash_attn =*/ false,
15112
15247
  /*.abort_callback =*/ nullptr,
15113
15248
  /*.abort_callback_data =*/ nullptr,
15114
15249
  };
@@ -15275,6 +15410,7 @@ struct llama_context * llama_new_context_with_model(
15275
15410
  cparams.defrag_thold = params.defrag_thold;
15276
15411
  cparams.embeddings = params.embeddings;
15277
15412
  cparams.offload_kqv = params.offload_kqv;
15413
+ cparams.flash_attn = params.flash_attn;
15278
15414
  cparams.pooling_type = params.pooling_type;
15279
15415
 
15280
15416
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -15282,12 +15418,20 @@ struct llama_context * llama_new_context_with_model(
15282
15418
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
15283
15419
 
15284
15420
  // this is necessary due to kv_self.n being padded later during inference
15285
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
15421
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
15286
15422
 
15287
15423
  // with causal attention, the batch size is limited by the context size
15288
15424
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
15289
- cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
15290
15425
 
15426
+ // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
15427
+ // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
15428
+ // ref: https://github.com/ggerganov/llama.cpp/pull/5021
15429
+ if (cparams.n_batch < GGML_KQ_MASK_PAD) {
15430
+ LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
15431
+ cparams.n_batch = GGML_KQ_MASK_PAD;
15432
+ }
15433
+
15434
+ cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
15291
15435
 
15292
15436
  cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
15293
15437
  hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
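
Two sizing rules change here: the context length is now padded to a multiple of 256 (previously 32), and the batch size is clamped up to GGML_KQ_MASK_PAD so GPU kernels such as ggml_flash_attn_ext never read past the KQ mask. A sketch of the rounding, assuming GGML_PAD's power-of-two round-up and GGML_KQ_MASK_PAD = 32 (the constant's value is an assumption here):

    #include <cstdint>
    #include <cstdio>

    // power-of-two round-up, mirroring GGML_PAD from ggml.h
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main() {
        const uint32_t GGML_KQ_MASK_PAD = 32; // assumed kernel padding constant

        uint32_t n_ctx   = 1000;
        uint32_t n_batch = 16;

        n_ctx = GGML_PAD(n_ctx, 256u); // this release pads to 256 (was 32)
        if (n_batch < GGML_KQ_MASK_PAD) {
            n_batch = GGML_KQ_MASK_PAD; // keep KQ_mask accesses in-bounds
        }

        std::printf("n_ctx = %u, n_batch = %u\n", n_ctx, n_batch); // 1024, 32
        return 0;
    }
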
@@ -15319,6 +15463,23 @@ struct llama_context * llama_new_context_with_model(
15319
15463
  }
15320
15464
  }
15321
15465
 
15466
+ if (cparams.flash_attn && hparams.use_alibi) {
15467
+ LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
15468
+ cparams.flash_attn = false;
15469
+ }
15470
+
15471
+ if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
15472
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
15473
+ cparams.flash_attn = false;
15474
+ }
15475
+
15476
+ #ifdef GGML_USE_HIPBLAS
15477
+ if (cparams.flash_attn) {
15478
+ LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
15479
+ cparams.flash_attn = false;
15480
+ }
15481
+ #endif
15482
+
15322
15483
  if (params.seed == LLAMA_DEFAULT_SEED) {
15323
15484
  params.seed = time(NULL);
15324
15485
  }
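
Downstream code opts in to flash attention through the new public context flag; the guards above may silently force it back off for ALiBi models, Grok, and HIPBLAS builds. A usage sketch against the public API of this release (model loading elided):

    #include "llama.h"

    int main() {
        llama_context_params cparams = llama_context_default_params();
        cparams.flash_attn = true; // new in this release; defaults to false

        // ... then create the context as usual, e.g.:
        // llama_context * ctx = llama_new_context_with_model(model, cparams);
        return 0;
    }
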
@@ -15326,6 +15487,7 @@ struct llama_context * llama_new_context_with_model(
15326
15487
  LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
15327
15488
  LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
15328
15489
  LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
15490
+ LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
15329
15491
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
15330
15492
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
15331
15493
 
@@ -15454,7 +15616,7 @@ struct llama_context * llama_new_context_with_model(
15454
15616
  }
15455
15617
  ctx->backends.push_back(ctx->backend_cpu);
15456
15618
 
15457
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
15619
+ if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
15458
15620
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
15459
15621
  llama_free(ctx);
15460
15622
  return nullptr;
@@ -16053,6 +16215,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
16053
16215
  const size_t s_kv_head = sizeof(uint32_t);
16054
16216
  const size_t s_kv_size = sizeof(uint32_t);
16055
16217
  const size_t s_kv_used = sizeof(uint32_t);
16218
+ const size_t s_v_trans = sizeof(uint32_t);
16056
16219
  const size_t s_kv = ctx->kv_self.total_size();
16057
16220
  const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
16058
16221
  const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
@@ -16070,10 +16233,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
16070
16233
  + s_kv_head
16071
16234
  + s_kv_size
16072
16235
  + s_kv_used
16236
+ + s_v_trans
16073
16237
  + s_kv
16074
16238
  + s_kv_cells
16075
16239
  );
16076
16240
 
16241
+ // on session change it is very likely that the state size has changed - so we need to update this function
16242
+ static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
16243
+
16077
16244
  return s_total;
16078
16245
  }
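
The static_assert above is a maintenance tripwire: the serializer and the format version are tied together at compile time, so bumping LLAMA_SESSION_VERSION forces whoever bumps it to revisit this size computation. A minimal sketch of the pattern with illustrative stand-ins:

    #include <cstddef>
    #include <cstdint>

    constexpr int SESSION_VERSION = 6; // stand-in for LLAMA_SESSION_VERSION

    static size_t state_get_size() {
        static_assert(SESSION_VERSION == 6,
            "session version bumped - update state_get_size accordingly");
        const size_t s_v_trans = sizeof(uint32_t); // the field added in this release
        return s_v_trans; // ... plus the other serialized field sizes
    }

    int main() {
        (void) state_get_size();
        return 0;
    }
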
16079
16246
 
@@ -16219,11 +16386,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
16219
16386
  const uint32_t kv_size = kv_self.size;
16220
16387
  const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
16221
16388
  const uint32_t kv_used = kv_self.used;
16389
+ const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
16222
16390
 
16223
16391
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
16224
16392
  data_ctx->write(&kv_head, sizeof(kv_head));
16225
16393
  data_ctx->write(&kv_size, sizeof(kv_size));
16226
16394
  data_ctx->write(&kv_used, sizeof(kv_used));
16395
+ data_ctx->write(&v_trans, sizeof(v_trans));
16227
16396
 
16228
16397
  if (kv_buf_size) {
16229
16398
  const size_t pre_kv_buf_size = data_ctx->get_size_written();
@@ -16236,7 +16405,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
16236
16405
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
16237
16406
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
16238
16407
 
16239
- if (kv_self.recurrent) {
16408
+ if (kv_self.recurrent || !kv_self.v_trans) {
16240
16409
  // v is contiguous for recurrent models
16241
16410
  // TODO: use other tensors for state models than k and v
16242
16411
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16369,11 +16538,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16369
16538
  uint32_t kv_head;
16370
16539
  uint32_t kv_size;
16371
16540
  uint32_t kv_used;
16541
+ uint32_t v_trans;
16372
16542
 
16373
16543
  memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
16374
16544
  memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
16375
16545
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
16376
16546
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
16547
+ memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
16548
+
16549
+ GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
16377
16550
 
16378
16551
  if (kv_self.size != kv_size) {
16379
16552
  // the KV cache needs to be big enough to load all the KV cells from the saved state
@@ -16383,6 +16556,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16383
16556
  __func__, kv_head, kv_size, kv_self.size);
16384
16557
  }
16385
16558
 
16559
+ llama_kv_cache_clear(ctx);
16560
+
16386
16561
  if (kv_buf_size) {
16387
16562
  const size_t pre_kv_buf_size = inp - src;
16388
16563
 
@@ -16394,7 +16569,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16394
16569
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
16395
16570
  inp += k_size;
16396
16571
 
16397
- if (kv_self.recurrent) {
16572
+ if (kv_self.recurrent || !kv_self.v_trans) {
16398
16573
  // v is contiguous for recurrent models
16399
16574
  // TODO: use other tensors for state models than k and v
16400
16575
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16416,8 +16591,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16416
16591
  GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
16417
16592
  }
16418
16593
 
16419
- llama_kv_cache_clear(ctx);
16420
-
16421
16594
  ctx->kv_self.head = kv_head;
16422
16595
  ctx->kv_self.used = kv_used;
16423
16596
 
@@ -16677,28 +16850,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
16677
16850
  }
16678
16851
  }
16679
16852
 
16680
- // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16681
- const uint32_t kv_size = kv_self.size;
16682
- for (int il = 0; il < (int)n_layer; ++il) {
16683
- // Write value type
16684
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16685
- data_ctx.write(&v_type_i, sizeof(v_type_i));
16853
+ // TODO: simplify, reduce copy-paste
16854
+ if (!kv_self.v_trans) {
16855
+ for (int il = 0; il < (int)n_layer; ++il) {
16856
+ // Write value type
16857
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16858
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16686
16859
 
16687
- // Write element size
16688
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16689
- data_ctx.write(&v_size_el, sizeof(v_size_el));
16860
+ // Write row size of value
16861
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
16862
+ data_ctx.write(&v_size_row, sizeof(v_size_row));
16690
16863
 
16691
- // For each row, we get the element values of each cell
16692
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16693
- // Read each range of cells of v_size_el length each into tmp_buf and write out
16864
+ // Read each range of cells of v_size length each into tmp_buf and write out
16694
16865
  for (const auto & range : cell_ranges) {
16695
16866
  const size_t range_size = range.second - range.first;
16696
- const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16697
- tmp_buf.resize(range_size * v_size_el);
16698
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16867
+ tmp_buf.resize(range_size * v_size_row);
16868
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
16699
16869
  data_ctx.write(tmp_buf.data(), tmp_buf.size());
16700
16870
  }
16701
16871
  }
16872
+ } else {
16873
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16874
+ const uint32_t kv_size = kv_self.size;
16875
+ for (int il = 0; il < (int)n_layer; ++il) {
16876
+ // Write value type
16877
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16878
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16879
+
16880
+ // Write element size
16881
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16882
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
16883
+
16884
+ // For each row, we get the element values of each cell
16885
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16886
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
16887
+ for (const auto & range : cell_ranges) {
16888
+ const size_t range_size = range.second - range.first;
16889
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16890
+ tmp_buf.resize(range_size * v_size_el);
16891
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16892
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
16893
+ }
16894
+ }
16895
+ }
16702
16896
  }
16703
16897
 
16704
16898
  return data_ctx.get_size_written();
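
The split above exists because a non-transposed V cache (used with flash attention in this release) keeps each cell's values contiguous, so a cell range is one row-sized copy, while the transposed layout needs a strided copy per embedding row, which is exactly the src_offset = (range.first + j*kv_size) * v_size_el arithmetic retained in the else branch. A standalone sketch of the two access patterns on a toy buffer:

    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        const size_t kv_size = 8;  // cells in the cache
        const size_t n_embd  = 4;  // value elements per cell
        const size_t el      = sizeof(float);

        std::vector<float> v(kv_size * n_embd, 1.0f); // toy V tensor

        const size_t first = 2, count = 3; // one cell range: [2, 5)

        // !v_trans: cells are contiguous rows -> a single copy per range
        std::vector<char> out_rows(count * n_embd * el);
        std::memcpy(out_rows.data(),
                    (const char *) v.data() + first * n_embd * el,
                    out_rows.size());

        // v_trans: cells are strided columns -> one copy per embedding row j
        std::vector<char> out_cols(count * n_embd * el);
        for (size_t j = 0; j < n_embd; ++j) {
            std::memcpy(out_cols.data() + j * count * el,
                        (const char *) v.data() + (first + j * kv_size) * el,
                        count * el);
        }

        std::printf("row layout: 1 copy, transposed layout: %zu copies\n", n_embd);
        return 0;
    }
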
@@ -16823,41 +17017,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
16823
17017
  }
16824
17018
  }
16825
17019
 
16826
- // For each layer, read the values for each cell (transposed)
16827
- for (int il = 0; il < (int)n_layer; ++il) {
16828
- // Read type of value
16829
- int32_t v_type_i_ref;
16830
- memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
16831
- inp += sizeof(v_type_i_ref);
16832
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16833
- if (v_type_i != v_type_i_ref) {
16834
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16835
- LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
16836
- return 0;
16837
- }
17020
+ // TODO: simplify, reduce copy-paste
17021
+ if (!kv_self.v_trans) {
17022
+ for (int il = 0; il < (int)n_layer; ++il) {
17023
+ // Read type of value
17024
+ int32_t v_type_i_ref;
17025
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
17026
+ inp += sizeof(v_type_i_ref);
17027
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
17028
+ if (v_type_i != v_type_i_ref) {
17029
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17030
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
17031
+ return 0;
17032
+ }
16838
17033
 
16839
- // Read element size of value
16840
- size_t v_size_el_ref;
16841
- memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
16842
- inp += sizeof(v_size_el_ref);
16843
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16844
- if (v_size_el != v_size_el_ref) {
16845
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16846
- LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
16847
- return 0;
16848
- }
17034
+ // Read row size of value
17035
+ size_t v_size_row_ref;
17036
+ memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
17037
+ inp += sizeof(v_size_row_ref);
17038
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
17039
+ if (v_size_row != v_size_row_ref) {
17040
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17041
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
17042
+ return 0;
17043
+ }
16849
17044
 
16850
- if (cell_count) {
16851
- // For each row in the transposed matrix, read the values for the whole cell range
16852
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16853
- const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
16854
- ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
16855
- inp += cell_count * v_size_el;
17045
+ if (cell_count) {
17046
+ // Read and set the values for the whole cell range
17047
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
17048
+ inp += cell_count * v_size_row;
17049
+ }
17050
+ }
17051
+ } else {
17052
+ // For each layer, read the values for each cell (transposed)
17053
+ for (int il = 0; il < (int)n_layer; ++il) {
17054
+ // Read type of value
17055
+ int32_t v_type_i_ref;
17056
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
17057
+ inp += sizeof(v_type_i_ref);
17058
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
17059
+ if (v_type_i != v_type_i_ref) {
17060
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17061
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
17062
+ return 0;
17063
+ }
17064
+
17065
+ // Read element size of value
17066
+ size_t v_size_el_ref;
17067
+ memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
17068
+ inp += sizeof(v_size_el_ref);
17069
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
17070
+ if (v_size_el != v_size_el_ref) {
17071
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17072
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
17073
+ return 0;
17074
+ }
17075
+
17076
+ if (cell_count) {
17077
+ // For each row in the transposed matrix, read the values for the whole cell range
17078
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
17079
+ const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
17080
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
17081
+ inp += cell_count * v_size_el;
17082
+ }
16856
17083
  }
16857
17084
  }
16858
17085
  }
16859
17086
 
16860
17087
  const size_t nread = inp - src;
17088
+
16861
17089
  return nread;
16862
17090
  }
16863
17091
 
@@ -17654,9 +17882,9 @@ const char * llama_print_system_info(void) {
17654
17882
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
17655
17883
  s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
17656
17884
  #ifdef GGML_USE_LLAMAFILE
17657
- s += "LAMMAFILE = 1 | ";
17885
+ s += "LLAMAFILE = 1 | ";
17658
17886
  #else
17659
- s += "LAMMAFILE = 0 | ";
17887
+ s += "LLAMAFILE = 0 | ";
17660
17888
  #endif
17661
17889
 
17662
17890
  return s.c_str();