llama_cpp 0.14.7 → 0.15.0

@@ -75,6 +75,7 @@
75
75
  #include <forward_list>
76
76
  #include <fstream>
77
77
  #include <functional>
78
+ #include <future>
78
79
  #include <initializer_list>
79
80
  #include <locale>
80
81
  #include <map>
@@ -107,7 +108,6 @@
107
108
  #define LLAMA_MAX_NODES 8192
108
109
  #define LLAMA_MAX_EXPERTS 60
109
110
 
110
-
111
111
  //
112
112
  // logging
113
113
  //
@@ -316,6 +316,7 @@ enum llm_kv {
316
316
  LLM_KV_SSM_TIME_STEP_RANK,
317
317
 
318
318
  LLM_KV_TOKENIZER_MODEL,
319
+ LLM_KV_TOKENIZER_PRE,
319
320
  LLM_KV_TOKENIZER_LIST,
320
321
  LLM_KV_TOKENIZER_TOKEN_TYPE,
321
322
  LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
392
393
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
393
394
 
394
395
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
396
+ { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
395
397
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
396
398
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
397
399
  { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
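The new key is plain string metadata, so it can be inspected with the gguf C API before a model is loaded. A minimal sketch (the model path is a placeholder; in this version of the bundled llama.cpp the gguf_* functions are declared in ggml.h):

    #include "ggml.h"   // gguf_* API ships with ggml in this version
    #include <cstdio>

    int main() {
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params); // placeholder path
        if (ctx == nullptr) {
            return 1;
        }

        const int key_id = gguf_find_key(ctx, "tokenizer.ggml.pre");
        if (key_id >= 0) {
            std::printf("pre-tokenizer: %s\n", gguf_get_val_str(ctx, key_id));
        } else {
            std::printf("no tokenizer.ggml.pre key; llama.cpp falls back to 'default'\n");
        }

        gguf_free(ctx);
        return 0;
    }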
@@ -1843,7 +1845,7 @@ struct llama_hparams {
1843
1845
  float f_logit_scale = 0.0f;
1844
1846
 
1845
1847
  bool causal_attn = true;
1846
- bool need_kq_pos = false;
1848
+ bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
1847
1849
 
1848
1850
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1849
1851
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1933,6 +1935,7 @@ struct llama_cparams {
1933
1935
  bool embeddings;
1934
1936
  bool causal_attn;
1935
1937
  bool offload_kqv;
1938
+ bool flash_attn;
1936
1939
 
1937
1940
  enum llama_pooling_type pooling_type;
1938
1941
 
@@ -2036,8 +2039,8 @@ struct llama_kv_cache {
2036
2039
  bool has_shift = false;
2037
2040
  bool do_defrag = false;
2038
2041
  bool do_copy = false;
2039
- // with recurrent state models, a cell can hold the state for more than one past token
2040
- bool recurrent = false;
2042
+ bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
2043
+ bool v_trans = true; // the value tensor is transposed
2041
2044
 
2042
2045
  // Note: The value of head isn't only used to optimize searching
2043
2046
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -2114,7 +2117,8 @@ struct llama_vocab {
2114
2117
  ttype type;
2115
2118
  };
2116
2119
 
2117
- enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
2120
+ enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
2121
+ enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2118
2122
 
2119
2123
  std::unordered_map<token, id> token_to_id;
2120
2124
  std::vector<token_data> id_to_token;
@@ -2335,11 +2339,14 @@ struct llama_context {
2335
2339
 
2336
2340
  static bool llama_kv_cache_init(
2337
2341
  struct llama_kv_cache & cache,
2338
- const llama_model & model,
2342
+ const llama_context * ctx,
2339
2343
  ggml_type type_k,
2340
2344
  ggml_type type_v,
2341
2345
  uint32_t kv_size,
2342
2346
  bool offload) {
2347
+ const llama_model & model = ctx->model;
2348
+ const llama_cparams & cparams = ctx->cparams;
2349
+
2343
2350
  const struct llama_hparams & hparams = model.hparams;
2344
2351
 
2345
2352
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
@@ -2350,8 +2357,9 @@ static bool llama_kv_cache_init(
2350
2357
 
2351
2358
  // TODO: find a nicer way to add other recurrent model architectures
2352
2359
  cache.recurrent = model.arch == LLM_ARCH_MAMBA;
2360
+ cache.v_trans = !cparams.flash_attn;
2353
2361
 
2354
- // TODO: support mixed reccurent Transformer architectues
2362
+ // TODO: support mixed recurrent Transformer architectures
2355
2363
  // NOTE: (!a || b) is a logical implication (a -> b)
2356
2364
  GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
2357
2365
  GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
@@ -2562,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
2562
2570
  }
2563
2571
  cache.head = 0;
2564
2572
  cache.used = 0;
2573
+
2574
+ for (auto & buf : cache.bufs) {
2575
+ ggml_backend_buffer_clear(buf, 0);
2576
+ }
2565
2577
  }
2566
2578
 
2567
2579
  static bool llama_kv_cache_seq_rm(
@@ -2882,6 +2894,7 @@ namespace GGUFMeta {
2882
2894
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
2883
2895
  case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
2884
2896
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
2897
+ case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
2885
2898
  }
2886
2899
  return "unknown";
2887
2900
  }
@@ -2893,13 +2906,16 @@ namespace GGUFMeta {
2893
2906
  __func__, override_type_to_str(ovrd->tag), ovrd->key);
2894
2907
  switch (ovrd->tag) {
2895
2908
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
2896
- LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
2909
+ LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
2897
2910
  } break;
2898
2911
  case LLAMA_KV_OVERRIDE_TYPE_INT: {
2899
- LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
2912
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
2900
2913
  } break;
2901
2914
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
2902
- LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
2915
+ LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
2916
+ } break;
2917
+ case LLAMA_KV_OVERRIDE_TYPE_STR: {
2918
+ LLAMA_LOG_INFO("%s\n", ovrd->val_str);
2903
2919
  } break;
2904
2920
  default:
2905
2921
  // Shouldn't be possible to end up here, but just in case...
@@ -2918,7 +2934,7 @@ namespace GGUFMeta {
2918
2934
  static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
2919
2935
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2920
2936
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
2921
- target = ovrd->bool_value;
2937
+ target = ovrd->val_bool;
2922
2938
  return true;
2923
2939
  }
2924
2940
  return false;
@@ -2928,7 +2944,7 @@ namespace GGUFMeta {
2928
2944
  static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
2929
2945
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2930
2946
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
2931
- target = ovrd->int_value;
2947
+ target = ovrd->val_i64;
2932
2948
  return true;
2933
2949
  }
2934
2950
  return false;
@@ -2938,7 +2954,7 @@ namespace GGUFMeta {
2938
2954
  static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
2939
2955
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
2940
2956
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
2941
- target = ovrd->float_value;
2957
+ target = ovrd->val_f64;
2942
2958
  return true;
2943
2959
  }
2944
2960
  return false;
@@ -2947,12 +2963,11 @@ namespace GGUFMeta {
2947
2963
  template<typename OT>
2948
2964
  static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
2949
2965
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
2950
- (void)target;
2951
- (void)ovrd;
2952
- if (!ovrd) { return false; }
2953
- // Currently, we should never end up here so it would be a bug if we do.
2954
- throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
2955
- ovrd ? ovrd->key : "NULL"));
2966
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
2967
+ target = ovrd->val_str;
2968
+ return true;
2969
+ }
2970
+ return false;
2956
2971
  }
2957
2972
 
2958
2973
  static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
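With the string case handled, metadata overrides passed through llama_model_params::kv_overrides can now carry text values such as tokenizer.ggml.pre. A rough usage sketch, assuming the llama.h layout where key and val_str are fixed-size char arrays and an empty key terminates the list:

    #include "llama.h"
    #include <cstring>

    static llama_model * load_with_pre_override(const char * path) {
        llama_model_params mparams = llama_model_default_params();

        static llama_model_kv_override overrides[2] = {}; // zero-initialized; empty key ends the list
        std::strncpy(overrides[0].key, "tokenizer.ggml.pre", sizeof(overrides[0].key) - 1);
        overrides[0].tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        std::strncpy(overrides[0].val_str, "llama3", sizeof(overrides[0].val_str) - 1);

        mparams.kv_overrides = overrides;
        return llama_load_model_from_file(path, mparams);
    }

The loader then reports the override through validate_override() exactly as it already does for the bool/int/float cases.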
@@ -2985,6 +3000,7 @@ struct llama_model_loader {
2985
3000
  size_t n_bytes = 0;
2986
3001
 
2987
3002
  bool use_mmap = false;
3003
+ bool check_tensors;
2988
3004
 
2989
3005
  llama_files files;
2990
3006
  llama_ftype ftype;
@@ -3018,7 +3034,7 @@ struct llama_model_loader {
3018
3034
  std::string arch_name;
3019
3035
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
3020
3036
 
3021
- llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
3037
+ llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
3022
3038
  int trace = 0;
3023
3039
  if (getenv("LLAMA_TRACE")) {
3024
3040
  trace = atoi(getenv("LLAMA_TRACE"));
@@ -3115,9 +3131,17 @@ struct llama_model_loader {
3115
3131
 
3116
3132
  fver = (enum llama_fver) gguf_get_version(meta);
3117
3133
 
3134
+ std::set<std::string> tensor_names;
3118
3135
  for (auto & w : weights) {
3119
3136
  n_elements += ggml_nelements(w.tensor);
3120
3137
  n_bytes += ggml_nbytes(w.tensor);
3138
+ // make sure there is no duplicated tensor names
3139
+ const std::string name(w.tensor->name);
3140
+ auto found = tensor_names.find(name);
3141
+ if (found != tensor_names.end()) {
3142
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
3143
+ }
3144
+ tensor_names.insert(name);
3121
3145
  }
3122
3146
 
3123
3147
  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -3223,6 +3247,7 @@ struct llama_model_loader {
3223
3247
  }
3224
3248
 
3225
3249
  this->use_mmap = use_mmap;
3250
+ this->check_tensors = check_tensors;
3226
3251
  }
3227
3252
 
3228
3253
  ~llama_model_loader() {
@@ -3481,6 +3506,10 @@ struct llama_model_loader {
3481
3506
  file->seek(w.offs, SEEK_SET);
3482
3507
  file->read_raw(cur->data, ggml_nbytes(cur));
3483
3508
  }
3509
+
3510
+ if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
3511
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3512
+ }
3484
3513
  }
3485
3514
 
3486
3515
  size_t size_done = 0;
@@ -3497,6 +3526,8 @@ struct llama_model_loader {
3497
3526
  GGML_ASSERT(size_data != 0 && "call init_mappings() first");
3498
3527
 
3499
3528
  std::vector<no_init<uint8_t>> read_buf;
3529
+ std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
3530
+
3500
3531
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3501
3532
  const auto * weight = get_weight(ggml_get_name(cur));
3502
3533
  if (weight == nullptr) {
@@ -3518,37 +3549,66 @@ struct llama_model_loader {
3518
3549
  if (bufs_mmap.count(weight->idx)) {
3519
3550
  buf_mmap = bufs_mmap.at(weight->idx);
3520
3551
  }
3552
+ uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
3553
+
3554
+ if (check_tensors) {
3555
+ validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
3556
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
3557
+ }));
3558
+ }
3559
+
3521
3560
  GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
3522
3561
  if (buf_mmap && cur->data == nullptr) {
3523
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
3562
+ ggml_backend_tensor_alloc(buf_mmap, cur, data);
3524
3563
  if (lmlocks) {
3525
3564
  const auto & lmlock = lmlocks->at(weight->idx);
3526
- lmlock->grow_to(weight->offs + ggml_nbytes(cur));
3565
+ lmlock->grow_to(weight->offs + n_size);
3527
3566
  }
3528
3567
 
3529
3568
  auto & mmap_used = mmaps_used[weight->idx];
3530
3569
  mmap_used.first = std::min(mmap_used.first, weight->offs);
3531
3570
  mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
3532
3571
  } else {
3533
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
3572
+ ggml_backend_tensor_set(cur, data, 0, n_size);
3534
3573
  }
3535
3574
  } else {
3536
3575
  GGML_ASSERT(weight->idx < files.size());
3537
3576
  const auto & file = files.at(weight->idx);
3538
3577
  if (ggml_backend_buffer_is_host(cur->buffer)) {
3539
3578
  file->seek(weight->offs, SEEK_SET);
3540
- file->read_raw(cur->data, ggml_nbytes(cur));
3579
+ file->read_raw(cur->data, n_size);
3580
+ if (check_tensors) {
3581
+ validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
3582
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
3583
+ }));
3584
+ }
3541
3585
  } else {
3542
- read_buf.resize(ggml_nbytes(cur));
3586
+ read_buf.resize(n_size);
3543
3587
  file->seek(weight->offs, SEEK_SET);
3544
- file->read_raw(read_buf.data(), ggml_nbytes(cur));
3588
+ file->read_raw(read_buf.data(), n_size);
3545
3589
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3590
+ if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
3591
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3592
+ }
3546
3593
  }
3547
3594
  }
3548
3595
 
3549
3596
  size_done += n_size;
3550
3597
  }
3551
3598
 
3599
+ // check validation results
3600
+ bool validation_failed = false;
3601
+ for (auto & future : validation_result) {
3602
+ auto result = future.get();
3603
+ if (!result.second) {
3604
+ LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
3605
+ validation_failed = true;
3606
+ }
3607
+ }
3608
+ if (validation_failed) {
3609
+ throw std::runtime_error("found tensors with invalid data");
3610
+ }
3611
+
3552
3612
  // check if this is the last call and do final cleanup
3553
3613
  if (size_done >= size_data) {
3554
3614
  // unmap offloaded tensors and metadata
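The check_tensors path above issues one std::async task per tensor and only joins the futures after all reads have been queued, so validation overlaps with I/O and upload instead of serializing behind them. The same pattern in isolation, with a dummy check standing in for ggml_validate_row_data:

    #include <cstdint>
    #include <future>
    #include <stdexcept>
    #include <string>
    #include <utility>
    #include <vector>

    struct chunk {
        std::string          name;
        std::vector<uint8_t> data;
    };

    static void read_and_validate(const std::vector<chunk> & chunks) {
        std::vector<std::future<std::pair<const chunk *, bool>>> results;

        for (const auto & c : chunks) {
            // ... read c.data here (file->read_raw / ggml_backend_tensor_set in the loader) ...

            results.emplace_back(std::async(std::launch::async, [&c] {
                const bool ok = !c.data.empty(); // stand-in for ggml_validate_row_data(type, data, nbytes)
                return std::make_pair(&c, ok);
            }));
        }

        bool failed = false;
        for (auto & fut : results) {
            const auto res = fut.get();
            if (!res.second) {
                failed = true; // the loader logs the offending tensor name here
            }
        }
        if (failed) {
            throw std::runtime_error("found chunks with invalid data");
        }
    }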
@@ -4142,7 +4202,7 @@ static void llm_load_hparams(
4142
4202
  model.ftype = ml.ftype;
4143
4203
 
4144
4204
  if (hparams.f_max_alibi_bias > 0.0f) {
4145
- hparams.need_kq_pos = true;
4205
+ hparams.use_alibi = true;
4146
4206
  }
4147
4207
 
4148
4208
  hparams.rope_type = llama_rope_type(&model);
@@ -4165,11 +4225,13 @@ static void llm_load_vocab(
4165
4225
 
4166
4226
  // determine vocab type
4167
4227
  {
4168
- std::string tokenizer_name;
4228
+ std::string tokenizer_model;
4229
+ std::string tokenizer_pre;
4169
4230
 
4170
- ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
4231
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
4232
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
4171
4233
 
4172
- if (tokenizer_name == "no_vocab") {
4234
+ if (tokenizer_model == "no_vocab") {
4173
4235
  vocab.type = LLAMA_VOCAB_TYPE_NONE;
4174
4236
 
4175
4237
  // default special tokens
@@ -4183,7 +4245,7 @@ static void llm_load_vocab(
4183
4245
  vocab.linefeed_id = -1;
4184
4246
 
4185
4247
  return;
4186
- } else if (tokenizer_name == "llama") {
4248
+ } else if (tokenizer_model == "llama") {
4187
4249
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
4188
4250
 
4189
4251
  // default special tokens
@@ -4228,9 +4290,27 @@ static void llm_load_vocab(
4228
4290
  if (add_space_prefix_keyidx != -1) {
4229
4291
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4230
4292
  } // The default value of add_space_prefix is true.
4231
- } else if (tokenizer_name == "gpt2") {
4232
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
4293
+ } else if (tokenizer_model == "bert") {
4294
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;
4233
4295
 
4296
+ // default special tokens
4297
+ vocab.special_bos_id = -1;
4298
+ vocab.special_eos_id = -1;
4299
+ vocab.special_unk_id = 100;
4300
+ vocab.special_sep_id = 102;
4301
+ vocab.special_pad_id = 0;
4302
+ vocab.special_cls_id = 101;
4303
+ vocab.special_mask_id = 103;
4304
+ vocab.add_space_prefix = false;
4305
+ } else {
4306
+ if (tokenizer_model == "gpt2") {
4307
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
4308
+ } else {
4309
+ LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
4310
+ LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4311
+ vocab.type = LLAMA_VOCAB_TYPE_SPM;
4312
+ return;
4313
+ }
4234
4314
  // read bpe merges and populate bpe ranks
4235
4315
  const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
4236
4316
  if (merges_keyidx == -1) {
@@ -4264,23 +4344,50 @@ static void llm_load_vocab(
4264
4344
  vocab.special_pad_id = -1;
4265
4345
  vocab.special_cls_id = -1;
4266
4346
  vocab.special_mask_id = -1;
4267
- } else if (tokenizer_name == "bert") {
4268
- vocab.type = LLAMA_VOCAB_TYPE_WPM;
4347
+ }
4269
4348
 
4270
- // default special tokens
4271
- vocab.special_bos_id = -1;
4272
- vocab.special_eos_id = -1;
4273
- vocab.special_unk_id = 100;
4274
- vocab.special_sep_id = 102;
4275
- vocab.special_pad_id = 0;
4276
- vocab.special_cls_id = 101;
4277
- vocab.special_mask_id = 103;
4278
- vocab.add_space_prefix = false;
4349
+ // for now, only BPE models have pre-tokenizers
4350
+ if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
4351
+ if (tokenizer_pre.empty()) {
4352
+ LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
4353
+ LLAMA_LOG_WARN("%s: \n", __func__);
4354
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4355
+ LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
4356
+ LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
4357
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4358
+ LLAMA_LOG_WARN("%s: \n", __func__);
4359
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4360
+ } else if (
4361
+ tokenizer_pre == "default") {
4362
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4363
+ } else if (
4364
+ tokenizer_pre == "llama3" ||
4365
+ tokenizer_pre == "llama-v3" ||
4366
+ tokenizer_pre == "llama-bpe") {
4367
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
4368
+ } else if (
4369
+ tokenizer_pre == "deepseek-llm") {
4370
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
4371
+ } else if (
4372
+ tokenizer_pre == "deepseek-coder") {
4373
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
4374
+ } else if (
4375
+ tokenizer_pre == "falcon") {
4376
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
4377
+ } else if (
4378
+ tokenizer_pre == "mpt") {
4379
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
4380
+ } else if (
4381
+ tokenizer_pre == "starcoder") {
4382
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
4383
+ } else if (
4384
+ tokenizer_pre == "gpt-2") {
4385
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4386
+ } else {
4387
+ throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4388
+ }
4279
4389
  } else {
4280
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
4281
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4282
-
4283
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
4390
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4284
4391
  }
4285
4392
  }
4286
4393
 
@@ -5975,7 +6082,7 @@ static bool llm_load_tensors(
5975
6082
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
5976
6083
  static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
5977
6084
  try {
5978
- llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
6085
+ llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
5979
6086
 
5980
6087
  model.hparams.vocab_only = params.vocab_only;
5981
6088
 
@@ -6104,37 +6211,47 @@ static struct ggml_tensor * llm_build_inp_embd(
6104
6211
  static void llm_build_kv_store(
6105
6212
  struct ggml_context * ctx,
6106
6213
  const llama_hparams & hparams,
6214
+ const llama_cparams & cparams,
6107
6215
  const llama_kv_cache & kv,
6108
6216
  struct ggml_cgraph * graph,
6109
6217
  struct ggml_tensor * k_cur,
6110
6218
  struct ggml_tensor * v_cur,
6111
- int64_t n_ctx,
6112
6219
  int32_t n_tokens,
6113
6220
  int32_t kv_head,
6114
6221
  const llm_build_cb & cb,
6115
6222
  int64_t il) {
6223
+ const int64_t n_ctx = cparams.n_ctx;
6224
+
6116
6225
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
6117
6226
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
6118
6227
 
6119
6228
  GGML_ASSERT(kv.size == n_ctx);
6120
6229
 
6121
- // compute the transposed [n_tokens, n_embd] V matrix
6122
- assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
6123
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
6124
- cb(v_cur_t, "v_cur_t", il);
6125
-
6126
6230
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
6127
6231
  (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
6128
6232
  cb(k_cache_view, "k_cache_view", il);
6129
6233
 
6130
- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
6131
- ( n_ctx)*ggml_element_size(kv.v_l[il]),
6132
- (kv_head)*ggml_element_size(kv.v_l[il]));
6234
+ // note: storing RoPE-ed version of K in the KV cache
6235
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
6236
+
6237
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
6238
+
6239
+ struct ggml_tensor * v_cache_view = nullptr;
6240
+
6241
+ if (cparams.flash_attn) {
6242
+ v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
6243
+ (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
6244
+ } else {
6245
+ // note: the V cache is transposed when not using flash attention
6246
+ v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
6247
+ ( n_ctx)*ggml_element_size(kv.v_l[il]),
6248
+ (kv_head)*ggml_element_size(kv.v_l[il]));
6249
+
6250
+ v_cur = ggml_transpose(ctx, v_cur);
6251
+ }
6133
6252
  cb(v_cache_view, "v_cache_view", il);
6134
6253
 
6135
- // important: storing RoPE-ed version of K in the KV cache!
6136
- ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
6137
- ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
6254
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
6138
6255
  }
6139
6256
 
6140
6257
  static struct ggml_tensor * llm_build_norm(
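The branch above is the heart of the new cache layout: with flash attention the V cache keeps one contiguous row of n_embd_v_gqa values per slot (like K), while the existing path stores V transposed with a stride of n_ctx elements. A standalone toy illustration of the two destination views, using the same ggml calls as the hunk (all sizes made up):

    #include "ggml.h"

    static void v_cache_view_demo() {
        ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ false };
        ggml_context * ctx = ggml_init(ip);

        const int64_t n_ctx        = 8; // toy cache length
        const int64_t n_embd_v_gqa = 4; // toy value width
        const int64_t n_tokens     = 2; // tokens being appended
        const int64_t kv_head      = 3; // slot where they go

        // flash-attn layout: row-major per slot, so the destination is a flat 1-D range
        ggml_tensor * v_flat    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd_v_gqa, n_ctx);
        ggml_tensor * view_flat = ggml_view_1d(ctx, v_flat, n_tokens*n_embd_v_gqa,
                kv_head*ggml_row_size(v_flat->type, n_embd_v_gqa));

        // legacy layout: transposed, so the destination is a strided 2-D view (stride = n_ctx elements)
        ggml_tensor * v_trans    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ctx, n_embd_v_gqa);
        ggml_tensor * view_trans = ggml_view_2d(ctx, v_trans, n_tokens, n_embd_v_gqa,
                n_ctx*ggml_element_size(v_trans),
                kv_head*ggml_element_size(v_trans));

        (void) view_flat;
        (void) view_trans;
        ggml_free(ctx);
    }

This is also why llama_kv_cache.v_trans and the defrag/copy paths later in the diff have to distinguish the two layouts.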
@@ -6354,11 +6471,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
6354
6471
  return moe_out;
6355
6472
  }
6356
6473
 
6357
- // if max_alibi_bias > 0 then apply ALiBi
6358
6474
  static struct ggml_tensor * llm_build_kqv(
6359
6475
  struct ggml_context * ctx,
6360
6476
  const llama_model & model,
6361
6477
  const llama_hparams & hparams,
6478
+ const llama_cparams & cparams,
6362
6479
  const llama_kv_cache & kv,
6363
6480
  struct ggml_cgraph * graph,
6364
6481
  struct ggml_tensor * wo,
@@ -6366,12 +6483,12 @@ static struct ggml_tensor * llm_build_kqv(
6366
6483
  struct ggml_tensor * q_cur,
6367
6484
  struct ggml_tensor * kq_mask,
6368
6485
  struct ggml_tensor * kq_pos,
6369
- int64_t n_ctx,
6370
6486
  int32_t n_tokens,
6371
6487
  int32_t n_kv,
6372
6488
  float kq_scale,
6373
6489
  const llm_build_cb & cb,
6374
6490
  int il) {
6491
+ const int64_t n_ctx = cparams.n_ctx;
6375
6492
  const int64_t n_head = hparams.n_head;
6376
6493
  const int64_t n_head_kv = hparams.n_head_kv;
6377
6494
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -6389,71 +6506,99 @@ static struct ggml_tensor * llm_build_kqv(
6389
6506
  0);
6390
6507
  cb(k, "k", il);
6391
6508
 
6392
- struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6393
- cb(kq, "kq", il);
6509
+ struct ggml_tensor * cur;
6394
6510
 
6395
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6396
- // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6397
- // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6398
- ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6399
- }
6511
+ if (cparams.flash_attn) {
6512
+ GGML_UNUSED(model);
6513
+ GGML_UNUSED(n_ctx);
6400
6514
 
6401
- if (model.arch == LLM_ARCH_GROK) {
6402
- // need to do the following:
6403
- // multiply by attn_output_multiplyer of 0.08838834764831845
6404
- // and then :
6405
- // kq = 30 * tanh(kq / 30)
6406
- // before the softmax below
6515
+ // note: if this assert triggers, then some check has failed earlier
6516
+ // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
6517
+ GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
6407
6518
 
6408
- //try from phi2
6409
- //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6519
+ // split cached v into n_head heads (not transposed)
6520
+ struct ggml_tensor * v =
6521
+ ggml_view_3d(ctx, kv.v_l[il],
6522
+ n_embd_head_v, n_kv, n_head_kv,
6523
+ ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
6524
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
6525
+ 0);
6526
+ cb(v, "v", il);
6410
6527
 
6411
- kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
6412
- kq = ggml_scale(ctx, kq, 30);
6413
- }
6528
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
6529
+
6530
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6531
+ ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6532
+ }
6533
+
6534
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
6535
+ } else {
6536
+ struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6537
+ cb(kq, "kq", il);
6538
+
6539
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6540
+ // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6541
+ // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6542
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6543
+ }
6544
+
6545
+ if (model.arch == LLM_ARCH_GROK) {
6546
+ // need to do the following:
6547
+ // multiply by attn_output_multiplyer of 0.08838834764831845
6548
+ // and then :
6549
+ // kq = 30 * tanh(kq / 30)
6550
+ // before the softmax below
6551
+
6552
+ //try from phi2
6553
+ //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6554
+
6555
+ kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
6556
+ kq = ggml_scale(ctx, kq, 30);
6557
+ }
6414
6558
 
6415
6559
  #if defined(GGML_USE_KOMPUTE)
6416
6560
  #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
6417
6561
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
6418
6562
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
6419
- if (hparams.f_max_alibi_bias > 0.0f) {
6420
- kq = ggml_scale(ctx, kq, kq_scale);
6421
- cb(kq, "kq_scaled", il);
6563
+ if (hparams.use_alibi) {
6564
+ kq = ggml_scale(ctx, kq, kq_scale);
6565
+ cb(kq, "kq_scaled", il);
6422
6566
 
6423
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6424
- cb(kq, "kq_scaled_alibi", il);
6567
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6568
+ cb(kq, "kq_scaled_alibi", il);
6425
6569
 
6426
- kq = ggml_add(ctx, kq, kq_mask);
6427
- cb(kq, "kq_masked", il);
6570
+ kq = ggml_add(ctx, kq, kq_mask);
6571
+ cb(kq, "kq_masked", il);
6428
6572
 
6429
- kq = ggml_soft_max(ctx, kq);
6430
- cb(kq, "kq_soft_max", il);
6431
- } else
6573
+ kq = ggml_soft_max(ctx, kq);
6574
+ cb(kq, "kq_soft_max", il);
6575
+ } else
6432
6576
  #endif
6433
- {
6434
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6435
- cb(kq, "kq_soft_max_ext", il);
6436
- }
6577
+ {
6578
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6579
+ cb(kq, "kq_soft_max_ext", il);
6580
+ }
6437
6581
 
6438
- GGML_ASSERT(kv.size == n_ctx);
6582
+ GGML_ASSERT(kv.size == n_ctx);
6439
6583
 
6440
- // split cached v into n_head heads
6441
- struct ggml_tensor * v =
6442
- ggml_view_3d(ctx, kv.v_l[il],
6443
- n_kv, n_embd_head_v, n_head_kv,
6444
- ggml_element_size(kv.v_l[il])*n_ctx,
6445
- ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
6446
- 0);
6447
- cb(v, "v", il);
6584
+ // split cached v into n_head heads
6585
+ struct ggml_tensor * v =
6586
+ ggml_view_3d(ctx, kv.v_l[il],
6587
+ n_kv, n_embd_head_v, n_head_kv,
6588
+ ggml_element_size(kv.v_l[il])*n_ctx,
6589
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
6590
+ 0);
6591
+ cb(v, "v", il);
6448
6592
 
6449
- struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
6450
- cb(kqv, "kqv", il);
6593
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
6594
+ cb(kqv, "kqv", il);
6451
6595
 
6452
- struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6453
- cb(kqv_merged, "kqv_merged", il);
6596
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6597
+ cb(kqv_merged, "kqv_merged", il);
6454
6598
 
6455
- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6456
- cb(cur, "kqv_merged_cont", il);
6599
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6600
+ cb(cur, "kqv_merged_cont", il);
6601
+ }
6457
6602
 
6458
6603
  ggml_build_forward_expand(graph, cur);
6459
6604
 
@@ -6473,6 +6618,7 @@ static struct ggml_tensor * llm_build_kv(
6473
6618
  struct ggml_context * ctx,
6474
6619
  const llama_model & model,
6475
6620
  const llama_hparams & hparams,
6621
+ const llama_cparams & cparams,
6476
6622
  const llama_kv_cache & kv,
6477
6623
  struct ggml_cgraph * graph,
6478
6624
  struct ggml_tensor * wo,
@@ -6482,7 +6628,6 @@ static struct ggml_tensor * llm_build_kv(
6482
6628
  struct ggml_tensor * q_cur,
6483
6629
  struct ggml_tensor * kq_mask,
6484
6630
  struct ggml_tensor * kq_pos,
6485
- int64_t n_ctx,
6486
6631
  int32_t n_tokens,
6487
6632
  int32_t kv_head,
6488
6633
  int32_t n_kv,
@@ -6496,12 +6641,12 @@ static struct ggml_tensor * llm_build_kv(
6496
6641
  ggml_build_forward_expand(graph, k_cur);
6497
6642
  ggml_build_forward_expand(graph, v_cur);
6498
6643
 
6499
- llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
6644
+ llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
6500
6645
 
6501
6646
  struct ggml_tensor * cur;
6502
6647
 
6503
- cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
6504
- q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
6648
+ cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
6649
+ q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
6505
6650
  cb(cur, "kqv_out", il);
6506
6651
 
6507
6652
  return cur;
@@ -6543,6 +6688,8 @@ struct llm_build_context {
6543
6688
  const int32_t kv_head; // index of where we store new KV data in the cache
6544
6689
  const int32_t n_orig_ctx;
6545
6690
 
6691
+ const bool flash_attn;
6692
+
6546
6693
  const enum llama_pooling_type pooling_type;
6547
6694
  const enum llama_rope_type rope_type;
6548
6695
 
@@ -6589,6 +6736,7 @@ struct llm_build_context {
6589
6736
  n_outputs (worst_case ? n_tokens : lctx.n_outputs),
6590
6737
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
6591
6738
  n_orig_ctx (cparams.n_yarn_orig_ctx),
6739
+ flash_attn (cparams.flash_attn),
6592
6740
  pooling_type (cparams.pooling_type),
6593
6741
  rope_type (hparams.rope_type),
6594
6742
  cb (cb),
@@ -6703,15 +6851,31 @@ struct llm_build_context {
6703
6851
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
6704
6852
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
6705
6853
 
6706
- ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6707
- nm, n_embd_v_gqa,
6708
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6709
- ggml_row_size(kv_self.v_l[il]->type, i));
6854
+ ggml_tensor * view_v_src;
6855
+ ggml_tensor * view_v_dst;
6710
6856
 
6711
- ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6712
- nm, n_embd_v_gqa,
6713
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6714
- ggml_row_size(kv_self.v_l[il]->type, id));
6857
+ if (flash_attn) {
6858
+ // NOTE: the V cache is not transposed when using flash attention
6859
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6860
+ n_embd_v_gqa, nm,
6861
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
6862
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
6863
+
6864
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6865
+ n_embd_v_gqa, nm,
6866
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
6867
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
6868
+ } else {
6869
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6870
+ nm, n_embd_v_gqa,
6871
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6872
+ ggml_row_size(kv_self.v_l[il]->type, i));
6873
+
6874
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6875
+ nm, n_embd_v_gqa,
6876
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6877
+ ggml_row_size(kv_self.v_l[il]->type, id));
6878
+ }
6715
6879
 
6716
6880
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
6717
6881
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
@@ -6741,20 +6905,26 @@ struct llm_build_context {
6741
6905
 
6742
6906
  struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
6743
6907
  if (causal) {
6744
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
6908
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
6745
6909
  } else {
6746
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
6910
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
6747
6911
  }
6748
6912
  cb(lctx.inp_KQ_mask, "KQ_mask", -1);
6749
6913
  ggml_set_input(lctx.inp_KQ_mask);
6750
- return lctx.inp_KQ_mask;
6914
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
6751
6915
  }
6752
6916
 
6753
- struct ggml_tensor * build_inp_KQ_pos() {
6754
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6917
+ struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
6918
+ if (causal) {
6919
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6920
+ } else {
6921
+ // TODO: this will be needed for ALiBi-based BERT models
6922
+ // https://github.com/ggerganov/llama.cpp/pull/6826
6923
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
6924
+ }
6755
6925
  cb(lctx.inp_KQ_pos, "KQ_pos", -1);
6756
6926
  ggml_set_input(lctx.inp_KQ_pos);
6757
- return lctx.inp_KQ_pos;
6927
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
6758
6928
  }
6759
6929
 
6760
6930
  struct ggml_tensor * build_inp_mean() {
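Two small details above matter for the flash-attention path: the mask is now padded to a multiple of GGML_KQ_MASK_PAD rows and cast to F16 before use. GGML_PAD is just round-up-to-a-multiple; for example (GGML_PAD comes from ggml.h, the concrete pad value is whatever llama.cpp defines for GGML_KQ_MASK_PAD):

    #include "ggml.h" // GGML_PAD
    #include <cassert>

    static void pad_demo() {
        assert(GGML_PAD( 1, 32) == 32); // rounds up to the next multiple of 32
        assert(GGML_PAD(32, 32) == 32);
        assert(GGML_PAD(33, 32) == 64);
    }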
@@ -6860,9 +7030,9 @@ struct llm_build_context {
6860
7030
  );
6861
7031
  cb(Kcur, "Kcur", il);
6862
7032
 
6863
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7033
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
6864
7034
  model.layers[il].wo, model.layers[il].bo,
6865
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7035
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6866
7036
  }
6867
7037
 
6868
7038
  if (il == n_layer - 1) {
@@ -7000,9 +7170,9 @@ struct llm_build_context {
7000
7170
  cb(Qcur, "Qcur", il);
7001
7171
  cb(Kcur, "Kcur", il);
7002
7172
 
7003
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7173
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7004
7174
  model.layers[il].wo, NULL,
7005
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7175
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7006
7176
  }
7007
7177
 
7008
7178
  if (il == n_layer - 1) {
@@ -7107,9 +7277,9 @@ struct llm_build_context {
7107
7277
  ext_factor, attn_factor, beta_fast, beta_slow
7108
7278
  );
7109
7279
  cb(Kcur, "Kcur", il);
7110
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7280
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7111
7281
  model.layers[il].wo, NULL,
7112
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7282
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7113
7283
  }
7114
7284
 
7115
7285
  if (il == n_layer - 1) {
@@ -7227,9 +7397,9 @@ struct llm_build_context {
7227
7397
  );
7228
7398
  cb(Kcur, "Kcur", il);
7229
7399
 
7230
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7400
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7231
7401
  model.layers[il].wo, NULL,
7232
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7402
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7233
7403
  }
7234
7404
 
7235
7405
  if (il == n_layer - 1) {
@@ -7352,9 +7522,9 @@ struct llm_build_context {
7352
7522
  );
7353
7523
  cb(Kcur, "Kcur", il);
7354
7524
 
7355
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7525
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7356
7526
  model.layers[il].wo, model.layers[il].bo,
7357
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7527
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7358
7528
  }
7359
7529
 
7360
7530
  if (il == n_layer - 1) {
@@ -7504,9 +7674,9 @@ struct llm_build_context {
7504
7674
  );
7505
7675
  cb(Kcur, "Kcur", il);
7506
7676
 
7507
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7508
- model.layers[il].wo, NULL,
7509
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7677
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7678
+ model.layers[il].wo, NULL,
7679
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7510
7680
  }
7511
7681
 
7512
7682
  if (il == n_layer - 1) {
@@ -7616,9 +7786,9 @@ struct llm_build_context {
7616
7786
 
7617
7787
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7618
7788
 
7619
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7789
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7620
7790
  model.layers[il].wo, model.layers[il].bo,
7621
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7791
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7622
7792
  }
7623
7793
 
7624
7794
  if (il == n_layer - 1) {
@@ -7820,9 +7990,9 @@ struct llm_build_context {
7820
7990
  );
7821
7991
  cb(Vcur, "Vcur", il);
7822
7992
 
7823
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7993
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7824
7994
  model.layers[il].wo, model.layers[il].bo,
7825
- Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7995
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7826
7996
  }
7827
7997
 
7828
7998
  if (il == n_layer - 1) {
@@ -7916,9 +8086,9 @@ struct llm_build_context {
7916
8086
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7917
8087
  cb(Qcur, "Qcur", il);
7918
8088
 
7919
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8089
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7920
8090
  model.layers[il].wo, NULL,
7921
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8091
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7922
8092
  }
7923
8093
 
7924
8094
  if (il == n_layer - 1) {
@@ -8209,9 +8379,9 @@ struct llm_build_context {
8209
8379
 
8210
8380
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8211
8381
 
8212
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8382
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8213
8383
  model.layers[il].wo, model.layers[il].bo,
8214
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8384
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8215
8385
  }
8216
8386
 
8217
8387
  if (il == n_layer - 1) {
@@ -8340,14 +8510,15 @@ struct llm_build_context {
8340
8510
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8341
8511
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8342
8512
 
8343
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8344
- model.layers[il].wo, model.layers[il].bo,
8345
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8513
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8514
+ model.layers[il].wo, model.layers[il].bo,
8515
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8346
8516
  } else {
8347
8517
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8348
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8518
+
8519
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8349
8520
  model.layers[il].wo, model.layers[il].bo,
8350
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8521
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8351
8522
  }
8352
8523
  }
8353
8524
 
@@ -8489,9 +8660,9 @@ struct llm_build_context {
8489
8660
  );
8490
8661
  cb(Kcur, "Kcur", il);
8491
8662
 
8492
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8663
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8493
8664
  model.layers[il].wo, NULL,
8494
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8665
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8495
8666
  }
8496
8667
 
8497
8668
  if (il == n_layer - 1) {
@@ -8607,9 +8778,9 @@ struct llm_build_context {
8607
8778
  );
8608
8779
  cb(Kcur, "Kcur", il);
8609
8780
 
8610
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8781
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8611
8782
  model.layers[il].wo, NULL,
8612
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8783
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8613
8784
  }
8614
8785
 
8615
8786
  if (il == n_layer - 1) {
@@ -8720,9 +8891,9 @@ struct llm_build_context {
8720
8891
  );
8721
8892
  cb(Kcur, "Kcur", il);
8722
8893
 
8723
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8894
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8724
8895
  model.layers[il].wo, model.layers[il].bo,
8725
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8896
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8726
8897
  }
8727
8898
 
8728
8899
  if (il == n_layer - 1) {
@@ -8834,9 +9005,9 @@ struct llm_build_context {
8834
9005
  );
8835
9006
  cb(Kcur, "Kcur", il);
8836
9007
 
8837
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9008
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8838
9009
  model.layers[il].wo, model.layers[il].bo,
8839
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9010
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8840
9011
  }
8841
9012
 
8842
9013
  if (il == n_layer - 1) {
@@ -8989,9 +9160,9 @@ struct llm_build_context {
8989
9160
  );
8990
9161
  cb(Kcur, "Kcur", il);
8991
9162
 
8992
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9163
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8993
9164
  model.layers[il].wo, model.layers[il].bo,
8994
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9165
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8995
9166
  }
8996
9167
 
8997
9168
  if (il == n_layer - 1) {
@@ -9106,9 +9277,9 @@ struct llm_build_context {
9106
9277
  );
9107
9278
  cb(Kcur, "Kcur", il);
9108
9279
 
9109
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9110
- model.layers[il].wo, NULL,
9111
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9280
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9281
+ model.layers[il].wo, model.layers[il].bo,
9282
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9112
9283
  }
9113
9284
 
9114
9285
  if (il == n_layer - 1) {
@@ -9219,9 +9390,9 @@ struct llm_build_context {
9219
9390
  ext_factor, attn_factor, beta_fast, beta_slow);
9220
9391
  cb(Kcur, "Kcur", il);
9221
9392
 
9222
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9393
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9223
9394
  model.layers[il].wo, NULL,
9224
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9395
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9225
9396
  }
9226
9397
  struct ggml_tensor * sa_out = cur;
9227
9398
 
@@ -9322,9 +9493,9 @@ struct llm_build_context {
9322
9493
 
9323
9494
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9324
9495
 
9325
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9496
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9326
9497
  model.layers[il].wo, model.layers[il].bo,
9327
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9498
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9328
9499
  }
9329
9500
 
9330
9501
  if (il == n_layer - 1) {
@@ -9429,9 +9600,9 @@ struct llm_build_context {
9429
9600
  );
9430
9601
  cb(Kcur, "Kcur", il);
9431
9602
 
9432
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9603
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9433
9604
  model.layers[il].wo, model.layers[il].bo,
9434
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9605
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9435
9606
  }
9436
9607
 
9437
9608
  if (il == n_layer - 1) {
@@ -9545,9 +9716,9 @@ struct llm_build_context {
9545
9716
  );
9546
9717
  cb(Kcur, "Kcur", il);
9547
9718
 
9548
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9719
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9549
9720
  model.layers[il].wo, NULL,
9550
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9721
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9551
9722
  }
9552
9723
 
9553
9724
  if (il == n_layer - 1) {
@@ -9662,9 +9833,9 @@ struct llm_build_context {
9662
9833
  );
9663
9834
  cb(Kcur, "Kcur", il);
9664
9835
 
9665
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9836
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9666
9837
  model.layers[il].wo, model.layers[il].bo,
9667
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9838
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9668
9839
  }
9669
9840
 
9670
9841
  if (il == n_layer - 1) {
@@ -9792,9 +9963,9 @@ struct llm_build_context {
9792
9963
  );
9793
9964
  cb(Kcur, "Kcur", il);
9794
9965
 
9795
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9966
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9796
9967
  model.layers[il].wo, model.layers[il].bo,
9797
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9968
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9798
9969
  }
9799
9970
 
9800
9971
  if (il == n_layer - 1) {
@@ -9913,9 +10084,9 @@ struct llm_build_context {
9913
10084
  ext_factor, attn_factor, beta_fast, beta_slow);
9914
10085
  cb(Kcur, "Kcur", il);
9915
10086
 
9916
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10087
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9917
10088
  model.layers[il].wo, NULL,
9918
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10089
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9919
10090
  }
9920
10091
 
9921
10092
  if (il == n_layer - 1) {
@@ -10032,9 +10203,9 @@ struct llm_build_context {
10032
10203
  );
10033
10204
  cb(Kcur, "Kcur", il);
10034
10205
 
10035
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10206
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10036
10207
  model.layers[il].wo, model.layers[il].bo,
10037
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10208
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10038
10209
  }
10039
10210
 
10040
10211
  if (il == n_layer - 1) {
@@ -10322,9 +10493,9 @@ struct llm_build_context {
10322
10493
  );
10323
10494
  cb(Kcur, "Kcur", il);
10324
10495
 
10325
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10496
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10326
10497
  model.layers[il].wo, model.layers[il].bo,
10327
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10498
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10328
10499
  }
10329
10500
 
10330
10501
  if (il == n_layer - 1) {
@@ -10453,9 +10624,9 @@ struct llm_build_context {
10453
10624
  );
10454
10625
  cb(Kcur, "Kcur", il);
10455
10626
 
10456
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10627
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10457
10628
  model.layers[il].wo, nullptr,
10458
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10629
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10459
10630
  }
10460
10631
 
10461
10632
  if (il == n_layer - 1) {
@@ -10882,7 +11053,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
10882
11053
  }
10883
11054
  }
10884
11055
 
10885
- if (hparams.need_kq_pos) {
11056
+ // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
11057
+ // this allows to process multiple sequences in parallel with ALiBi-based models
11058
+ if (hparams.use_alibi) {
10886
11059
  const int64_t n_kv = kv_self.n;
10887
11060
 
10888
11061
  GGML_ASSERT(lctx.inp_KQ_pos);
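The comment above is why one position per cached token is enough: the textbook ALiBi bias -m_h * (i - j) differs from m_h * pos_j only by a per-row constant, and softmax is invariant to such shifts. A small sketch under that reading, using the standard power-of-two slope formula (the actual kernel lives in ggml_soft_max_ext):

    #include <cmath>
    #include <vector>

    // per-head ALiBi slopes for a power-of-two head count, generalized by max_bias
    // (max_bias == 8.0f reproduces the slopes from the original ALiBi paper)
    static std::vector<float> alibi_slopes(int n_head, float max_bias) {
        std::vector<float> m(n_head);
        for (int h = 0; h < n_head; ++h) {
            m[h] = std::pow(2.0f, -max_bias*float(h + 1)/float(n_head));
        }
        return m;
    }

    // bias added to head h's score against the cached token at absolute position pos_j;
    // shifting every score in a row by the same constant does not change the softmax
    static float alibi_bias(float slope_h, int pos_j) {
        return slope_h*float(pos_j);
    }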
@@ -11264,7 +11437,7 @@ static int llama_decode_internal(
11264
11437
  // a heuristic, to avoid attending the full cache if it is not yet utilized
11265
11438
  // after enough generations, the benefit from this heuristic disappears
11266
11439
  // if we start defragmenting the cache, the benefit from this will be more important
11267
- kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
11440
+ kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
11268
11441
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
11269
11442
  }
11270
11443
  }
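The padding granularity for the active part of the cache goes from 32 to 256 here, presumably to suit the larger block sizes used by the new flash-attention kernels. Restated as a small helper (GGML_PAD from ggml.h):

    #include "ggml.h" // GGML_PAD
    #include <algorithm>
    #include <cstdint>

    // mirrors: kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)))
    static uint32_t active_kv_n(uint32_t kv_size, uint32_t cell_max) {
        return std::min(kv_size, std::max(256u, GGML_PAD(cell_max, 256)));
    }

    // e.g. active_kv_n(4096,   1) == 256
    //      active_kv_n(4096, 300) == 512
    //      active_kv_n( 128, 300) == 128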
@@ -11432,6 +11605,10 @@ static int llama_decode_internal(
11432
11605
  }
11433
11606
  }
11434
11607
 
11608
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
11609
+ // overlap with device computation.
11610
+ ggml_backend_sched_reset(lctx.sched);
11611
+
11435
11612
  return 0;
11436
11613
  }
11437
11614
 
@@ -11457,7 +11634,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
11457
11634
  // each move requires 6*n_layer tensors (see build_defrag)
11458
11635
  // - source view, destination view, copy operation
11459
11636
  // - x2 for keys and values
11460
- const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
11637
+ //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
11638
+ // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
11639
+ const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
11461
11640
 
11462
11641
  // determine which KV cells to move where
11463
11642
  //
@@ -11781,7 +11960,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
11781
11960
  }
11782
11961
  case LLAMA_VOCAB_TYPE_BPE: {
11783
11962
  GGML_ASSERT(false);
11784
- return unicode_utf8_to_byte(token_data.text);
11963
+ return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
11785
11964
  }
11786
11965
  case LLAMA_VOCAB_TYPE_WPM: {
11787
11966
  GGML_ASSERT(false);
@@ -12003,7 +12182,79 @@ struct llm_tokenizer_bpe {
12003
12182
 
12004
12183
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12005
12184
  int final_prev_index = -1;
12006
- auto word_collection = bpe_gpt2_preprocess(text);
12185
+
12186
+ std::vector<std::string> word_collection;
12187
+ switch (vocab.type) {
12188
+ case LLAMA_VOCAB_TYPE_BPE:
12189
+ switch (vocab.type_pre) {
12190
+ case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12191
+ word_collection = unicode_regex_split(text, {
12192
+ // original regex from tokenizer.json
12193
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12194
+
12195
+ // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
12196
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12197
+ });
12198
+ break;
12199
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12200
+ word_collection = unicode_regex_split(text, {
12201
+ "[\r\n]",
12202
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
12203
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
12204
+ "\\s+$",
12205
+ "[一-龥ࠀ-一가-퟿]+",
12206
+ "\\p{N}+",
12207
+ });
12208
+ break;
12209
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
12210
+ word_collection = unicode_regex_split(text, {
12211
+ "[\r\n]",
12212
+ "\\s?\\p{L}+",
12213
+ "\\s?\\p{P}+",
12214
+ "[一-龥ࠀ-一가-퟿]+",
12215
+ "\\p{N}+",
12216
+ });
12217
+ break;
12218
+ case LLAMA_VOCAB_PRE_TYPE_FALCON:
12219
+ word_collection = unicode_regex_split(text, {
12220
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12221
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12222
+ "\\p{N}+",
12223
+ "[0-9][0-9][0-9]",
12224
+ });
12225
+ break;
12226
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
12227
+ // TODO: MPT pre-tokenization regexes are unknown
12228
+ // the following are close, but not exact. run the following:
12229
+ // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
12230
+ GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
12231
+ word_collection = unicode_regex_split(text, {
12232
+ "\\s?\\p{L}+",
12233
+ "\\s?\\p{P}+",
12234
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12235
+ });
12236
+ break;
12237
+ case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12238
+ case LLAMA_VOCAB_PRE_TYPE_GPT2:
12239
+ word_collection = unicode_regex_split(text, {
12240
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12241
+ });
12242
+ break;
12243
+ default:
12244
+ // default regex for BPE tokenization pre-processing
12245
+ word_collection = unicode_regex_split(text, {
12246
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12247
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12248
+ "\\p{N}+",
12249
+ "[0-9][0-9][0-9]",
12250
+ });
12251
+ break;
12252
+ }
12253
+ break;
12254
+ default:
12255
+ GGML_ASSERT(false);
12256
+ break;
12257
+ }
12007
12258
 
12008
12259
  symbols_final.clear();
12009
12260
 
@@ -12130,145 +12381,6 @@ private:
12130
12381
  work_queue.push(bigram);
12131
12382
  }
12132
12383
 
12133
- std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
12134
- std::vector<std::string> bpe_words;
12135
- std::vector<std::string> bpe_encoded_words;
12136
-
12137
- std::string token = "";
12138
- // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
12139
- bool collecting_numeric = false;
12140
- bool collecting_letter = false;
12141
- bool collecting_special = false;
12142
- bool collecting_whitespace_lookahead = false;
12143
- bool collecting = false;
12144
-
12145
- std::vector<std::string> text_utf;
12146
- text_utf.reserve(text.size());
12147
- bpe_words.reserve(text.size());
12148
- bpe_encoded_words.reserve(text.size());
12149
-
12150
- const auto cpts = unicode_cpts_from_utf8(text);
12151
- for (size_t i = 0; i < cpts.size(); ++i)
12152
- text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
12153
-
12154
- for (int i = 0; i < (int)text_utf.size(); i++) {
12155
- const std::string & utf_char = text_utf[i];
12156
- bool split_condition = false;
12157
- int bytes_remain = text_utf.size() - i;
12158
- // forward backward lookups
12159
- const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
12160
- const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
12161
-
12162
- // handling contractions
12163
- if (!split_condition && bytes_remain >= 2) {
12164
- // 's|'t|'m|'d
12165
- if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
12166
- split_condition = true;
12167
- }
12168
- if (split_condition) {
12169
- if (token.size()) {
12170
- bpe_words.emplace_back(token); // push previous content as token
12171
- }
12172
- token = utf_char + utf_char_next;
12173
- bpe_words.emplace_back(token);
12174
- token = "";
12175
- i++;
12176
- continue;
12177
- }
12178
- }
12179
- if (!split_condition && bytes_remain >= 3) {
12180
- // 're|'ve|'ll
12181
- if (utf_char == "\'" && (
12182
- (utf_char_next == "r" && utf_char_next_next == "e") ||
12183
- (utf_char_next == "v" && utf_char_next_next == "e") ||
12184
- (utf_char_next == "l" && utf_char_next_next == "l"))
12185
- ) {
12186
- split_condition = true;
12187
- }
12188
- if (split_condition) {
12189
- // current token + next token can be defined
12190
- if (token.size()) {
12191
- bpe_words.emplace_back(token); // push previous content as token
12192
- }
12193
- token = utf_char + utf_char_next + utf_char_next_next;
12194
- bpe_words.emplace_back(token); // the contraction
12195
- token = "";
12196
- i += 2;
12197
- continue;
12198
- }
12199
- }
12200
-
12201
- if (!split_condition && !collecting) {
12202
- if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
12203
- collecting_letter = true;
12204
- collecting = true;
12205
- }
12206
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
12207
- collecting_numeric = true;
12208
- collecting = true;
12209
- }
12210
- else if (
12211
- ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
12212
- (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
12213
- ) {
12214
- collecting_special = true;
12215
- collecting = true;
12216
- }
12217
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
12218
- collecting_whitespace_lookahead = true;
12219
- collecting = true;
12220
- }
12221
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
12222
- split_condition = true;
12223
- }
12224
- }
12225
- else if (!split_condition && collecting) {
12226
- if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
12227
- split_condition = true;
12228
- }
12229
- else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
12230
- split_condition = true;
12231
- }
12232
- else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
12233
- split_condition = true;
12234
- }
12235
- else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
12236
- split_condition = true;
12237
- }
12238
- }
12239
-
12240
- if (utf_char_next == "") {
12241
- split_condition = true; // final
12242
- token += utf_char;
12243
- }
12244
-
12245
- if (split_condition) {
12246
- if (token.size()) {
12247
- bpe_words.emplace_back(token);
12248
- }
12249
- token = utf_char;
12250
- collecting = false;
12251
- collecting_letter = false;
12252
- collecting_numeric = false;
12253
- collecting_special = false;
12254
- collecting_whitespace_lookahead = false;
12255
- }
12256
- else {
12257
- token += utf_char;
12258
- }
12259
- }
12260
-
12261
- for (std::string & word : bpe_words) {
12262
- std::string encoded_token = "";
12263
- for (char & c : word) {
12264
- encoded_token += unicode_byte_to_utf8(c);
12265
- }
12266
- bpe_encoded_words.emplace_back(encoded_token);
12267
- }
12268
-
12269
- return bpe_encoded_words;
12270
- }
12271
-
12272
12384
  const llama_vocab & vocab;
12273
12385
 
12274
12386
  std::vector<llm_symbol> symbols;
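The switch on vocab.type_pre added above replaces the hand-written bpe_gpt2_preprocess splitter removed in this hunk with per-model regex sets fed to unicode_regex_split. A minimal, hypothetical sketch of the same idea using std::regex with an ASCII-only pattern; the real code needs unicode_regex_split because std::regex has no \p{L}/\p{N} support, so the pre_tokenize helper below is an illustrative stand-in, not part of llama.cpp:

    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    // Simplified, ASCII-only stand-in for a GPT-2 style pre-tokenizer split.
    static std::vector<std::string> pre_tokenize(const std::string & text) {
        // contractions | optional space + letters | optional space + digits |
        // optional space + other symbols | runs of spaces
        static const std::regex re(
            "'s|'t|'re|'ve|'m|'ll|'d| ?[A-Za-z]+| ?[0-9]+| ?[^A-Za-z0-9 ]+| +");
        std::vector<std::string> words;
        for (std::sregex_iterator it(text.begin(), text.end(), re), end; it != end; ++it) {
            words.push_back(it->str());
        }
        return words;
    }

    int main() {
        for (const auto & w : pre_tokenize("Hello world, it's 2024!")) {
            std::cout << "[" << w << "]\n"; // [Hello] [ world] [,] [ it] ['s] [ 2024] [!]
        }
    }

Each resulting word is then byte-encoded and BPE-merged as before; only the splitting step became model-specific.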
@@ -12588,7 +12700,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12588
12700
  } break;
12589
12701
  case LLAMA_VOCAB_TYPE_BPE:
12590
12702
  {
12591
- if (add_special && vocab.special_add_bos == 1) {
12703
+ if (add_special && vocab.special_add_bos != 0) {
12592
12704
  GGML_ASSERT(vocab.special_bos_id != -1);
12593
12705
  output.push_back(vocab.special_bos_id);
12594
12706
  }
@@ -14360,14 +14472,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
14360
14472
  }
14361
14473
 
14362
14474
  static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
14363
- std::mutex mutex;
14364
- int64_t counter = 0;
14365
- size_t new_size = 0;
14366
14475
  if (nthread < 2) {
14367
14476
  // single-thread
14368
- return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14477
+ size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14478
+ if (!ggml_validate_row_data(new_type, new_data, new_size)) {
14479
+ throw std::runtime_error("quantized data validation failed");
14480
+ }
14481
+ return new_size;
14369
14482
  }
14370
- auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
14483
+
14484
+ std::mutex mutex;
14485
+ int64_t counter = 0;
14486
+ size_t new_size = 0;
14487
+ bool valid = true;
14488
+ auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
14371
14489
  nrows, n_per_row, imatrix]() {
14372
14490
  const int64_t nrows_per_chunk = chunk_size / n_per_row;
14373
14491
  size_t local_size = 0;
@@ -14382,7 +14500,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14382
14500
  }
14383
14501
  lock.unlock();
14384
14502
  const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
14385
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14503
+ size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14504
+ local_size += this_size;
14505
+
14506
+ // validate the quantized data
14507
+ const size_t row_size = ggml_row_size(new_type, n_per_row);
14508
+ void * this_data = (char *) new_data + first_row * row_size;
14509
+ if (!ggml_validate_row_data(new_type, this_data, this_size)) {
14510
+ std::unique_lock<std::mutex> lock(mutex);
14511
+ valid = false;
14512
+ break;
14513
+ }
14386
14514
  }
14387
14515
  };
14388
14516
  for (int it = 0; it < nthread - 1; ++it) {
@@ -14391,6 +14519,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14391
14519
  compute();
14392
14520
  for (auto & w : workers) { w.join(); }
14393
14521
  workers.clear();
14522
+ if (!valid) {
14523
+ throw std::runtime_error("quantized data validation failed");
14524
+ }
14394
14525
  return new_size;
14395
14526
  }
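Both the single-threaded and the threaded path now validate every quantized chunk with ggml_validate_row_data and throw if any chunk is corrupt. A schematic stand-in for the worker pattern, with hypothetical process_chunk/validate_chunk in place of the real ggml_quantize_chunk/ggml_validate_row_data calls:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <mutex>
    #include <stdexcept>
    #include <thread>
    #include <vector>

    // Hypothetical stand-ins for ggml_quantize_chunk / ggml_validate_row_data.
    static size_t process_chunk (int64_t /*first*/, int64_t /*count*/) { return 0; }
    static bool   validate_chunk(int64_t /*first*/, int64_t /*count*/) { return true; }

    static size_t run_chunked(int64_t nrows, int64_t rows_per_chunk, int nthread) {
        std::mutex mutex;
        int64_t counter = 0;    // next row to hand out
        size_t  total   = 0;    // bytes produced by all workers
        bool    valid   = true; // flipped by the first worker that sees bad data

        auto worker = [&]() {
            for (;;) {
                int64_t first;
                {
                    std::lock_guard<std::mutex> lock(mutex);
                    if (!valid || counter >= nrows) return;
                    first    = counter;
                    counter += rows_per_chunk;
                }
                const int64_t count = std::min(nrows - first, rows_per_chunk);
                const size_t  size  = process_chunk(first, count);
                const bool    ok    = validate_chunk(first, count);

                std::lock_guard<std::mutex> lock(mutex);
                total += size;
                if (!ok) { valid = false; return; }
            }
        };

        std::vector<std::thread> workers;
        for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(worker);
        worker(); // the calling thread also works
        for (auto & w : workers) w.join();

        if (!valid) throw std::runtime_error("quantized data validation failed");
        return total;
    }

Validation happens per chunk rather than once at the end, so a corrupted tensor aborts the quantization as early as possible.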
14396
14527
 
@@ -14453,7 +14584,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14453
14584
  auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
14454
14585
  kv_overrides = v->data();
14455
14586
  }
14456
- llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
14587
+ llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
14457
14588
  ml.init_mappings(false); // no prefetching
14458
14589
 
14459
14590
  llama_model model;
@@ -14491,11 +14622,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14491
14622
  for (auto & o : overrides) {
14492
14623
  if (o.key[0] == 0) break;
14493
14624
  if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
14494
- gguf_set_val_f32(ctx_out, o.key, o.float_value);
14625
+ gguf_set_val_f32(ctx_out, o.key, o.val_f64);
14495
14626
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
14496
- gguf_set_val_i32(ctx_out, o.key, o.int_value);
14627
+ gguf_set_val_i32(ctx_out, o.key, o.val_i64);
14497
14628
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
14498
- gguf_set_val_bool(ctx_out, o.key, o.bool_value);
14629
+ gguf_set_val_bool(ctx_out, o.key, o.val_bool);
14630
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
14631
+ gguf_set_val_str(ctx_out, o.key, o.val_str);
14499
14632
  } else {
14500
14633
  LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
14501
14634
  }
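The KV-override writer above switches to the renamed union members (val_f64, val_i64, val_bool) and gains a string override type. A hedged sketch of passing overrides through llama_model_quantize_params; it assumes the llama.h definitions of this release (fixed-size key/val_str buffers, kv_overrides consumed as a pointer to a std::vector, an empty key terminating the list), and the key and value used are only examples:

    #include <cstring>
    #include <vector>
    #include "llama.h"

    static void quantize_with_overrides(const char * fname_inp, const char * fname_out) {
        std::vector<llama_model_kv_override> overrides(2); // zero-initialized

        std::strncpy(overrides[0].key, "tokenizer.ggml.pre", sizeof(overrides[0].key) - 1);
        overrides[0].tag = LLAMA_KV_OVERRIDE_TYPE_STR; // string overrides are new in this release
        std::strncpy(overrides[0].val_str, "llama3", sizeof(overrides[0].val_str) - 1);

        overrides[1].key[0] = 0; // empty key marks the end of the list

        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.kv_overrides = &overrides; // read back as std::vector<llama_model_kv_override> *

        llama_model_quantize(fname_inp, fname_out, &qparams);
    }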
@@ -14814,7 +14947,7 @@ static int llama_apply_lora_from_file_internal(
14814
14947
  std::unique_ptr<llama_model_loader> ml;
14815
14948
  if (path_base_model) {
14816
14949
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
14817
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
14950
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
14818
14951
  ml->init_mappings(/*prefetch*/ false); // no prefetching
14819
14952
  }
14820
14953
 
@@ -15073,6 +15206,7 @@ struct llama_model_params llama_model_default_params() {
15073
15206
  /*.vocab_only =*/ false,
15074
15207
  /*.use_mmap =*/ true,
15075
15208
  /*.use_mlock =*/ false,
15209
+ /*.check_tensors =*/ false,
15076
15210
  };
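check_tensors is a new model parameter: when set, tensor data is validated while the model is loaded (and the quantize path above now always loads with it enabled). A minimal usage sketch, assuming the llama.h API of this release:

    #include "llama.h"

    static struct llama_model * load_checked(const char * path) {
        struct llama_model_params mparams = llama_model_default_params();
        mparams.check_tensors = true; // validate tensor data while loading
        return llama_load_model_from_file(path, mparams);
    }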
15077
15211
 
15078
15212
  #ifdef GGML_USE_METAL
@@ -15109,6 +15243,7 @@ struct llama_context_params llama_context_default_params() {
15109
15243
  /*.logits_all =*/ false,
15110
15244
  /*.embeddings =*/ false,
15111
15245
  /*.offload_kqv =*/ true,
15246
+ /*.flash_attn =*/ false,
15112
15247
  /*.abort_callback =*/ nullptr,
15113
15248
  /*.abort_callback_data =*/ nullptr,
15114
15249
  };
@@ -15275,6 +15410,7 @@ struct llama_context * llama_new_context_with_model(
15275
15410
  cparams.defrag_thold = params.defrag_thold;
15276
15411
  cparams.embeddings = params.embeddings;
15277
15412
  cparams.offload_kqv = params.offload_kqv;
15413
+ cparams.flash_attn = params.flash_attn;
15278
15414
  cparams.pooling_type = params.pooling_type;
15279
15415
 
15280
15416
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -15282,12 +15418,20 @@ struct llama_context * llama_new_context_with_model(
15282
15418
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
15283
15419
 
15284
15420
  // this is necessary due to kv_self.n being padded later during inference
15285
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
15421
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
15286
15422
 
15287
15423
  // with causal attention, the batch size is limited by the context size
15288
15424
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
15289
- cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
15290
15425
 
15426
+ // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
15427
+ // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
15428
+ // ref: https://github.com/ggerganov/llama.cpp/pull/5021
15429
+ if (cparams.n_batch < GGML_KQ_MASK_PAD) {
15430
+ LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
15431
+ cparams.n_batch = GGML_KQ_MASK_PAD;
15432
+ }
15433
+
15434
+ cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
15291
15435
 
15292
15436
  cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
15293
15437
  hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
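The context size is now padded to a multiple of 256 (previously 32), and n_batch is clamped to at least GGML_KQ_MASK_PAD so kernels such as ggml_flash_attn_ext never read past the padded KQ mask. Illustrative arithmetic only; the pad_to helper and the constant 32 below are assumptions standing in for GGML_PAD and GGML_KQ_MASK_PAD from ggml.h:

    #include <cstdint>
    #include <cstdio>

    // round x up to a multiple of n, GGML_PAD-style
    static uint32_t pad_to(uint32_t x, uint32_t n) { return ((x + n - 1) / n) * n; }

    int main() {
        uint32_t n_ctx   = 5000;
        uint32_t n_batch = 16;
        n_ctx = pad_to(n_ctx, 256);      // 5000 -> 5120 (padding to 32 would have given 5024)
        if (n_batch < 32) n_batch = 32;  // assumed GGML_KQ_MASK_PAD value, for illustration only
        std::printf("n_ctx = %u, n_batch = %u\n", n_ctx, n_batch);
    }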
@@ -15319,6 +15463,23 @@ struct llama_context * llama_new_context_with_model(
15319
15463
  }
15320
15464
  }
15321
15465
 
15466
+ if (cparams.flash_attn && hparams.use_alibi) {
15467
+ LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
15468
+ cparams.flash_attn = false;
15469
+ }
15470
+
15471
+ if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
15472
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
15473
+ cparams.flash_attn = false;
15474
+ }
15475
+
15476
+ #ifdef GGML_USE_HIPBLAS
15477
+ if (cparams.flash_attn) {
15478
+ LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
15479
+ cparams.flash_attn = false;
15480
+ }
15481
+ #endif
15482
+
15322
15483
  if (params.seed == LLAMA_DEFAULT_SEED) {
15323
15484
  params.seed = time(NULL);
15324
15485
  }
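flash_attn is opt-in at context creation and, as the checks above show, it is forced off for ALiBi-based models, Grok, and HIPBLAS builds. A minimal usage sketch, assuming the llama.h API of this release (model loading and error handling omitted):

    #include "llama.h"

    static struct llama_context * make_ctx(struct llama_model * model) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx      = 4096;
        cparams.flash_attn = true; // new in this release; may still be disabled by the checks above
        return llama_new_context_with_model(model, cparams);
    }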
@@ -15326,6 +15487,7 @@ struct llama_context * llama_new_context_with_model(
15326
15487
  LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
15327
15488
  LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
15328
15489
  LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
15490
+ LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
15329
15491
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
15330
15492
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
15331
15493
 
@@ -15454,7 +15616,7 @@ struct llama_context * llama_new_context_with_model(
15454
15616
  }
15455
15617
  ctx->backends.push_back(ctx->backend_cpu);
15456
15618
 
15457
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
15619
+ if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
15458
15620
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
15459
15621
  llama_free(ctx);
15460
15622
  return nullptr;
@@ -16053,6 +16215,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
16053
16215
  const size_t s_kv_head = sizeof(uint32_t);
16054
16216
  const size_t s_kv_size = sizeof(uint32_t);
16055
16217
  const size_t s_kv_used = sizeof(uint32_t);
16218
+ const size_t s_v_trans = sizeof(uint32_t);
16056
16219
  const size_t s_kv = ctx->kv_self.total_size();
16057
16220
  const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
16058
16221
  const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
@@ -16070,10 +16233,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
16070
16233
  + s_kv_head
16071
16234
  + s_kv_size
16072
16235
  + s_kv_used
16236
+ + s_v_trans
16073
16237
  + s_kv
16074
16238
  + s_kv_cells
16075
16239
  );
16076
16240
 
16241
+ // on session change it is very likely that the state size has changed - so we need to update this function
16242
+ static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
16243
+
16077
16244
  return s_total;
16078
16245
  }
16079
16246
 
@@ -16219,11 +16386,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
16219
16386
  const uint32_t kv_size = kv_self.size;
16220
16387
  const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
16221
16388
  const uint32_t kv_used = kv_self.used;
16389
+ const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
16222
16390
 
16223
16391
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
16224
16392
  data_ctx->write(&kv_head, sizeof(kv_head));
16225
16393
  data_ctx->write(&kv_size, sizeof(kv_size));
16226
16394
  data_ctx->write(&kv_used, sizeof(kv_used));
16395
+ data_ctx->write(&v_trans, sizeof(v_trans));
16227
16396
 
16228
16397
  if (kv_buf_size) {
16229
16398
  const size_t pre_kv_buf_size = data_ctx->get_size_written();
@@ -16236,7 +16405,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
16236
16405
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
16237
16406
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
16238
16407
 
16239
- if (kv_self.recurrent) {
16408
+ if (kv_self.recurrent || !kv_self.v_trans) {
16240
16409
  // v is contiguous for recurrent models
16241
16410
  // TODO: use other tensors for state models than k and v
16242
16411
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16369,11 +16538,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16369
16538
  uint32_t kv_head;
16370
16539
  uint32_t kv_size;
16371
16540
  uint32_t kv_used;
16541
+ uint32_t v_trans;
16372
16542
 
16373
16543
  memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
16374
16544
  memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
16375
16545
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
16376
16546
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
16547
+ memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
16548
+
16549
+ GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
16377
16550
 
16378
16551
  if (kv_self.size != kv_size) {
16379
16552
  // the KV cache needs to be big enough to load all the KV cells from the saved state
@@ -16383,6 +16556,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16383
16556
  __func__, kv_head, kv_size, kv_self.size);
16384
16557
  }
16385
16558
 
16559
+ llama_kv_cache_clear(ctx);
16560
+
16386
16561
  if (kv_buf_size) {
16387
16562
  const size_t pre_kv_buf_size = inp - src;
16388
16563
 
@@ -16394,7 +16569,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16394
16569
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
16395
16570
  inp += k_size;
16396
16571
 
16397
- if (kv_self.recurrent) {
16572
+ if (kv_self.recurrent || !kv_self.v_trans) {
16398
16573
  // v is contiguous for recurrent models
16399
16574
  // TODO: use other tensors for state models than k and v
16400
16575
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16416,8 +16591,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16416
16591
  GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
16417
16592
  }
16418
16593
 
16419
- llama_kv_cache_clear(ctx);
16420
-
16421
16594
  ctx->kv_self.head = kv_head;
16422
16595
  ctx->kv_self.used = kv_used;
16423
16596
 
@@ -16677,28 +16850,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
16677
16850
  }
16678
16851
  }
16679
16852
 
16680
- // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16681
- const uint32_t kv_size = kv_self.size;
16682
- for (int il = 0; il < (int)n_layer; ++il) {
16683
- // Write value type
16684
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16685
- data_ctx.write(&v_type_i, sizeof(v_type_i));
16853
+ // TODO: simplify, reduce copy-paste
16854
+ if (!kv_self.v_trans) {
16855
+ for (int il = 0; il < (int)n_layer; ++il) {
16856
+ // Write value type
16857
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16858
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16686
16859
 
16687
- // Write element size
16688
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16689
- data_ctx.write(&v_size_el, sizeof(v_size_el));
16860
+ // Write row size of value
16861
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
16862
+ data_ctx.write(&v_size_row, sizeof(v_size_row));
16690
16863
 
16691
- // For each row, we get the element values of each cell
16692
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16693
- // Read each range of cells of v_size_el length each into tmp_buf and write out
16864
+ // Read each range of cells of v_size length each into tmp_buf and write out
16694
16865
  for (const auto & range : cell_ranges) {
16695
16866
  const size_t range_size = range.second - range.first;
16696
- const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16697
- tmp_buf.resize(range_size * v_size_el);
16698
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16867
+ tmp_buf.resize(range_size * v_size_row);
16868
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
16699
16869
  data_ctx.write(tmp_buf.data(), tmp_buf.size());
16700
16870
  }
16701
16871
  }
16872
+ } else {
16873
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16874
+ const uint32_t kv_size = kv_self.size;
16875
+ for (int il = 0; il < (int)n_layer; ++il) {
16876
+ // Write value type
16877
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16878
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16879
+
16880
+ // Write element size
16881
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16882
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
16883
+
16884
+ // For each row, we get the element values of each cell
16885
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16886
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
16887
+ for (const auto & range : cell_ranges) {
16888
+ const size_t range_size = range.second - range.first;
16889
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16890
+ tmp_buf.resize(range_size * v_size_el);
16891
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16892
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
16893
+ }
16894
+ }
16895
+ }
16702
16896
  }
16703
16897
 
16704
16898
  return data_ctx.get_size_written();
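The sequence-state writer above now handles two value-cache layouts: when v_trans is false (the flash_attn path) each cell's values form one contiguous row, so a cell range is a single copy per layer; when the cache is transposed, element j of every cell is stored with kv_size stride and the range must be gathered row by row. A schematic illustration with plain arrays instead of ggml tensors:

    #include <cstddef>
    #include <cstring>
    #include <vector>

    // Non-transposed layout: values stored cell-major, n_embd per cell.
    static std::vector<float> copy_cell_range_rows(const std::vector<float> & v, size_t n_embd,
                                                   size_t first_cell, size_t cell_count) {
        std::vector<float> out(cell_count * n_embd);
        std::memcpy(out.data(), v.data() + first_cell * n_embd, out.size() * sizeof(float));
        return out;
    }

    // Transposed layout: element j of every cell stored contiguously with kv_size stride.
    static std::vector<float> copy_cell_range_trans(const std::vector<float> & v, size_t n_embd,
                                                    size_t kv_size, size_t first_cell, size_t cell_count) {
        std::vector<float> out(cell_count * n_embd);
        for (size_t j = 0; j < n_embd; ++j) {
            std::memcpy(out.data() + j * cell_count,
                        v.data() + j * kv_size + first_cell,
                        cell_count * sizeof(float));
        }
        return out;
    }

The reader in llama_state_seq_set_data mirrors the same split, using the stored row size (non-transposed) or element size (transposed) as a sanity check.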
@@ -16823,41 +17017,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
16823
17017
  }
16824
17018
  }
16825
17019
 
16826
- // For each layer, read the values for each cell (transposed)
16827
- for (int il = 0; il < (int)n_layer; ++il) {
16828
- // Read type of value
16829
- int32_t v_type_i_ref;
16830
- memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
16831
- inp += sizeof(v_type_i_ref);
16832
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16833
- if (v_type_i != v_type_i_ref) {
16834
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16835
- LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
16836
- return 0;
16837
- }
17020
+ // TODO: simplify, reduce copy-paste
17021
+ if (!kv_self.v_trans) {
17022
+ for (int il = 0; il < (int)n_layer; ++il) {
17023
+ // Read type of value
17024
+ int32_t v_type_i_ref;
17025
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
17026
+ inp += sizeof(v_type_i_ref);
17027
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
17028
+ if (v_type_i != v_type_i_ref) {
17029
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17030
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
17031
+ return 0;
17032
+ }
16838
17033
 
16839
- // Read element size of value
16840
- size_t v_size_el_ref;
16841
- memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
16842
- inp += sizeof(v_size_el_ref);
16843
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16844
- if (v_size_el != v_size_el_ref) {
16845
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16846
- LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
16847
- return 0;
16848
- }
17034
+ // Read row size of value
17035
+ size_t v_size_row_ref;
17036
+ memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
17037
+ inp += sizeof(v_size_row_ref);
17038
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
17039
+ if (v_size_row != v_size_row_ref) {
17040
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17041
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
17042
+ return 0;
17043
+ }
16849
17044
 
16850
- if (cell_count) {
16851
- // For each row in the transposed matrix, read the values for the whole cell range
16852
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16853
- const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
16854
- ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
16855
- inp += cell_count * v_size_el;
17045
+ if (cell_count) {
17046
+ // Read and set the values for the whole cell range
17047
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
17048
+ inp += cell_count * v_size_row;
17049
+ }
17050
+ }
17051
+ } else {
17052
+ // For each layer, read the values for each cell (transposed)
17053
+ for (int il = 0; il < (int)n_layer; ++il) {
17054
+ // Read type of value
17055
+ int32_t v_type_i_ref;
17056
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
17057
+ inp += sizeof(v_type_i_ref);
17058
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
17059
+ if (v_type_i != v_type_i_ref) {
17060
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17061
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
17062
+ return 0;
17063
+ }
17064
+
17065
+ // Read element size of value
17066
+ size_t v_size_el_ref;
17067
+ memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
17068
+ inp += sizeof(v_size_el_ref);
17069
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
17070
+ if (v_size_el != v_size_el_ref) {
17071
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17072
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
17073
+ return 0;
17074
+ }
17075
+
17076
+ if (cell_count) {
17077
+ // For each row in the transposed matrix, read the values for the whole cell range
17078
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
17079
+ const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
17080
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
17081
+ inp += cell_count * v_size_el;
17082
+ }
16856
17083
  }
16857
17084
  }
16858
17085
  }
16859
17086
 
16860
17087
  const size_t nread = inp - src;
17088
+
16861
17089
  return nread;
16862
17090
  }
16863
17091
 
@@ -17654,9 +17882,9 @@ const char * llama_print_system_info(void) {
17654
17882
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
17655
17883
  s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
17656
17884
  #ifdef GGML_USE_LLAMAFILE
17657
- s += "LAMMAFILE = 1 | ";
17885
+ s += "LLAMAFILE = 1 | ";
17658
17886
  #else
17659
- s += "LAMMAFILE = 0 | ";
17887
+ s += "LLAMAFILE = 0 | ";
17660
17888
  #endif
17661
17889
 
17662
17890
  return s.c_str();