llama_cpp 0.14.7 → 0.15.1

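The diff below covers the vendored upstream llama.cpp sources. The main changes in this release are Flash Attention support (a new flash_attn context parameter and a non-transposed V cache layout), BF16 model file support, per-model BPE pre-tokenizer selection via the new tokenizer.ggml.pre metadata key, optional tensor-data validation at load time (check_tensors), and string-valued KV overrides. As a rough orientation, here is a minimal sketch of how the two new flags surface through the upstream C API; the field names follow the diff and the llama.h of this era, but treat the snippet as an illustrative assumption (including the placeholder model path), not documented gem usage.

    // Sketch: enabling the new 0.15.x options via the upstream llama.cpp C API.
    // Verify field and function names against the llama.h bundled with your build.
    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        mparams.check_tensors = true;   // validate tensor data while loading (new in this release)

        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
        if (model == NULL) {
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        cparams.flash_attn = true;      // use Flash Attention; the V cache is then stored non-transposed

        llama_context * ctx = llama_new_context_with_model(model, cparams);
        if (ctx == NULL) {
            llama_free_model(model);
            return 1;
        }

        // ... run inference ...

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }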
@@ -75,6 +75,7 @@
75
75
  #include <forward_list>
76
76
  #include <fstream>
77
77
  #include <functional>
78
+ #include <future>
78
79
  #include <initializer_list>
79
80
  #include <locale>
80
81
  #include <map>
@@ -107,7 +108,6 @@
107
108
  #define LLAMA_MAX_NODES 8192
108
109
  #define LLAMA_MAX_EXPERTS 60
109
110
 
110
-
111
111
  //
112
112
  // logging
113
113
  //
@@ -316,6 +316,7 @@ enum llm_kv {
316
316
  LLM_KV_SSM_TIME_STEP_RANK,
317
317
 
318
318
  LLM_KV_TOKENIZER_MODEL,
319
+ LLM_KV_TOKENIZER_PRE,
319
320
  LLM_KV_TOKENIZER_LIST,
320
321
  LLM_KV_TOKENIZER_TOKEN_TYPE,
321
322
  LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
392
393
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
393
394
 
394
395
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
396
+ { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
395
397
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
396
398
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
397
399
  { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
@@ -1843,7 +1845,7 @@ struct llama_hparams {
1843
1845
  float f_logit_scale = 0.0f;
1844
1846
 
1845
1847
  bool causal_attn = true;
1846
- bool need_kq_pos = false;
1848
+ bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
1847
1849
 
1848
1850
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1849
1851
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1933,6 +1935,7 @@ struct llama_cparams {
1933
1935
  bool embeddings;
1934
1936
  bool causal_attn;
1935
1937
  bool offload_kqv;
1938
+ bool flash_attn;
1936
1939
 
1937
1940
  enum llama_pooling_type pooling_type;
1938
1941
 
@@ -2036,8 +2039,8 @@ struct llama_kv_cache {
2036
2039
  bool has_shift = false;
2037
2040
  bool do_defrag = false;
2038
2041
  bool do_copy = false;
2039
- // with recurrent state models, a cell can hold the state for more than one past token
2040
- bool recurrent = false;
2042
+ bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
2043
+ bool v_trans = true; // the value tensor is transposed
2041
2044
 
2042
2045
  // Note: The value of head isn't only used to optimize searching
2043
2046
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -2114,7 +2117,8 @@ struct llama_vocab {
2114
2117
  ttype type;
2115
2118
  };
2116
2119
 
2117
- enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
2120
+ enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
2121
+ enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2118
2122
 
2119
2123
  std::unordered_map<token, id> token_to_id;
2120
2124
  std::vector<token_data> id_to_token;
@@ -2335,11 +2339,14 @@ struct llama_context {
2335
2339
 
2336
2340
  static bool llama_kv_cache_init(
2337
2341
  struct llama_kv_cache & cache,
2338
- const llama_model & model,
2342
+ const llama_context * ctx,
2339
2343
  ggml_type type_k,
2340
2344
  ggml_type type_v,
2341
2345
  uint32_t kv_size,
2342
2346
  bool offload) {
2347
+ const llama_model & model = ctx->model;
2348
+ const llama_cparams & cparams = ctx->cparams;
2349
+
2343
2350
  const struct llama_hparams & hparams = model.hparams;
2344
2351
 
2345
2352
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
@@ -2350,8 +2357,9 @@ static bool llama_kv_cache_init(
2350
2357
 
2351
2358
  // TODO: find a nicer way to add other recurrent model architectures
2352
2359
  cache.recurrent = model.arch == LLM_ARCH_MAMBA;
2360
+ cache.v_trans = !cparams.flash_attn;
2353
2361
 
2354
- // TODO: support mixed reccurent Transformer architectues
2362
+ // TODO: support mixed recurrent Transformer architectures
2355
2363
  // NOTE: (!a || b) is a logical implication (a -> b)
2356
2364
  GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
2357
2365
  GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
@@ -2562,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
2562
2570
  }
2563
2571
  cache.head = 0;
2564
2572
  cache.used = 0;
2573
+
2574
+ for (auto & buf : cache.bufs) {
2575
+ ggml_backend_buffer_clear(buf, 0);
2576
+ }
2565
2577
  }
2566
2578
 
2567
2579
  static bool llama_kv_cache_seq_rm(
@@ -2882,6 +2894,7 @@ namespace GGUFMeta {
2882
2894
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
2883
2895
  case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
2884
2896
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
2897
+ case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
2885
2898
  }
2886
2899
  return "unknown";
2887
2900
  }
@@ -2893,13 +2906,16 @@ namespace GGUFMeta {
2893
2906
  __func__, override_type_to_str(ovrd->tag), ovrd->key);
2894
2907
  switch (ovrd->tag) {
2895
2908
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
2896
- LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
2909
+ LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
2897
2910
  } break;
2898
2911
  case LLAMA_KV_OVERRIDE_TYPE_INT: {
2899
- LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
2912
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
2900
2913
  } break;
2901
2914
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
2902
- LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
2915
+ LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
2916
+ } break;
2917
+ case LLAMA_KV_OVERRIDE_TYPE_STR: {
2918
+ LLAMA_LOG_INFO("%s\n", ovrd->val_str);
2903
2919
  } break;
2904
2920
  default:
2905
2921
  // Shouldn't be possible to end up here, but just in case...
@@ -2918,7 +2934,7 @@ namespace GGUFMeta {
2918
2934
  static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
2919
2935
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2920
2936
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
2921
- target = ovrd->bool_value;
2937
+ target = ovrd->val_bool;
2922
2938
  return true;
2923
2939
  }
2924
2940
  return false;
@@ -2928,7 +2944,7 @@ namespace GGUFMeta {
2928
2944
  static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
2929
2945
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2930
2946
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
2931
- target = ovrd->int_value;
2947
+ target = ovrd->val_i64;
2932
2948
  return true;
2933
2949
  }
2934
2950
  return false;
@@ -2938,7 +2954,7 @@ namespace GGUFMeta {
2938
2954
  static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
2939
2955
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
2940
2956
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
2941
- target = ovrd->float_value;
2957
+ target = ovrd->val_f64;
2942
2958
  return true;
2943
2959
  }
2944
2960
  return false;
@@ -2947,12 +2963,11 @@ namespace GGUFMeta {
2947
2963
  template<typename OT>
2948
2964
  static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
2949
2965
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
2950
- (void)target;
2951
- (void)ovrd;
2952
- if (!ovrd) { return false; }
2953
- // Currently, we should never end up here so it would be a bug if we do.
2954
- throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
2955
- ovrd ? ovrd->key : "NULL"));
2966
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
2967
+ target = ovrd->val_str;
2968
+ return true;
2969
+ }
2970
+ return false;
2956
2971
  }
2957
2972
 
2958
2973
  static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
@@ -2985,6 +3000,7 @@ struct llama_model_loader {
2985
3000
  size_t n_bytes = 0;
2986
3001
 
2987
3002
  bool use_mmap = false;
3003
+ bool check_tensors;
2988
3004
 
2989
3005
  llama_files files;
2990
3006
  llama_ftype ftype;
@@ -3018,7 +3034,7 @@ struct llama_model_loader {
3018
3034
  std::string arch_name;
3019
3035
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
3020
3036
 
3021
- llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
3037
+ llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
3022
3038
  int trace = 0;
3023
3039
  if (getenv("LLAMA_TRACE")) {
3024
3040
  trace = atoi(getenv("LLAMA_TRACE"));
@@ -3115,9 +3131,17 @@ struct llama_model_loader {
3115
3131
 
3116
3132
  fver = (enum llama_fver) gguf_get_version(meta);
3117
3133
 
3134
+ std::set<std::string> tensor_names;
3118
3135
  for (auto & w : weights) {
3119
3136
  n_elements += ggml_nelements(w.tensor);
3120
3137
  n_bytes += ggml_nbytes(w.tensor);
3138
+ // make sure there is no duplicated tensor names
3139
+ const std::string name(w.tensor->name);
3140
+ auto found = tensor_names.find(name);
3141
+ if (found != tensor_names.end()) {
3142
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
3143
+ }
3144
+ tensor_names.insert(name);
3121
3145
  }
3122
3146
 
3123
3147
  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -3151,6 +3175,7 @@ struct llama_model_loader {
3151
3175
  switch (type_max) {
3152
3176
  case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
3153
3177
  case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
3178
+ case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
3154
3179
  case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
3155
3180
  case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
3156
3181
  case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@@ -3223,6 +3248,7 @@ struct llama_model_loader {
3223
3248
  }
3224
3249
 
3225
3250
  this->use_mmap = use_mmap;
3251
+ this->check_tensors = check_tensors;
3226
3252
  }
3227
3253
 
3228
3254
  ~llama_model_loader() {
@@ -3481,6 +3507,10 @@ struct llama_model_loader {
3481
3507
  file->seek(w.offs, SEEK_SET);
3482
3508
  file->read_raw(cur->data, ggml_nbytes(cur));
3483
3509
  }
3510
+
3511
+ if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
3512
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3513
+ }
3484
3514
  }
3485
3515
 
3486
3516
  size_t size_done = 0;
@@ -3497,6 +3527,8 @@ struct llama_model_loader {
3497
3527
  GGML_ASSERT(size_data != 0 && "call init_mappings() first");
3498
3528
 
3499
3529
  std::vector<no_init<uint8_t>> read_buf;
3530
+ std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
3531
+
3500
3532
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3501
3533
  const auto * weight = get_weight(ggml_get_name(cur));
3502
3534
  if (weight == nullptr) {
@@ -3518,37 +3550,66 @@ struct llama_model_loader {
3518
3550
  if (bufs_mmap.count(weight->idx)) {
3519
3551
  buf_mmap = bufs_mmap.at(weight->idx);
3520
3552
  }
3553
+ uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
3554
+
3555
+ if (check_tensors) {
3556
+ validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
3557
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
3558
+ }));
3559
+ }
3560
+
3521
3561
  GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
3522
3562
  if (buf_mmap && cur->data == nullptr) {
3523
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
3563
+ ggml_backend_tensor_alloc(buf_mmap, cur, data);
3524
3564
  if (lmlocks) {
3525
3565
  const auto & lmlock = lmlocks->at(weight->idx);
3526
- lmlock->grow_to(weight->offs + ggml_nbytes(cur));
3566
+ lmlock->grow_to(weight->offs + n_size);
3527
3567
  }
3528
3568
 
3529
3569
  auto & mmap_used = mmaps_used[weight->idx];
3530
3570
  mmap_used.first = std::min(mmap_used.first, weight->offs);
3531
3571
  mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
3532
3572
  } else {
3533
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
3573
+ ggml_backend_tensor_set(cur, data, 0, n_size);
3534
3574
  }
3535
3575
  } else {
3536
3576
  GGML_ASSERT(weight->idx < files.size());
3537
3577
  const auto & file = files.at(weight->idx);
3538
3578
  if (ggml_backend_buffer_is_host(cur->buffer)) {
3539
3579
  file->seek(weight->offs, SEEK_SET);
3540
- file->read_raw(cur->data, ggml_nbytes(cur));
3580
+ file->read_raw(cur->data, n_size);
3581
+ if (check_tensors) {
3582
+ validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
3583
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
3584
+ }));
3585
+ }
3541
3586
  } else {
3542
- read_buf.resize(ggml_nbytes(cur));
3587
+ read_buf.resize(n_size);
3543
3588
  file->seek(weight->offs, SEEK_SET);
3544
- file->read_raw(read_buf.data(), ggml_nbytes(cur));
3589
+ file->read_raw(read_buf.data(), n_size);
3545
3590
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3591
+ if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
3592
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3593
+ }
3546
3594
  }
3547
3595
  }
3548
3596
 
3549
3597
  size_done += n_size;
3550
3598
  }
3551
3599
 
3600
+ // check validation results
3601
+ bool validation_failed = false;
3602
+ for (auto & future : validation_result) {
3603
+ auto result = future.get();
3604
+ if (!result.second) {
3605
+ LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
3606
+ validation_failed = true;
3607
+ }
3608
+ }
3609
+ if (validation_failed) {
3610
+ throw std::runtime_error("found tensors with invalid data");
3611
+ }
3612
+
3552
3613
  // check if this is the last call and do final cleanup
3553
3614
  if (size_done >= size_data) {
3554
3615
  // unmap offloaded tensors and metadata
@@ -3606,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
3606
3667
  switch (ftype) {
3607
3668
  case LLAMA_FTYPE_ALL_F32: return "all F32";
3608
3669
  case LLAMA_FTYPE_MOSTLY_F16: return "F16";
3670
+ case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
3609
3671
  case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
3610
3672
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
3611
3673
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -4142,7 +4204,7 @@ static void llm_load_hparams(
4142
4204
  model.ftype = ml.ftype;
4143
4205
 
4144
4206
  if (hparams.f_max_alibi_bias > 0.0f) {
4145
- hparams.need_kq_pos = true;
4207
+ hparams.use_alibi = true;
4146
4208
  }
4147
4209
 
4148
4210
  hparams.rope_type = llama_rope_type(&model);
@@ -4165,11 +4227,13 @@ static void llm_load_vocab(
4165
4227
 
4166
4228
  // determine vocab type
4167
4229
  {
4168
- std::string tokenizer_name;
4230
+ std::string tokenizer_model;
4231
+ std::string tokenizer_pre;
4169
4232
 
4170
- ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
4233
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
4234
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
4171
4235
 
4172
- if (tokenizer_name == "no_vocab") {
4236
+ if (tokenizer_model == "no_vocab") {
4173
4237
  vocab.type = LLAMA_VOCAB_TYPE_NONE;
4174
4238
 
4175
4239
  // default special tokens
@@ -4183,7 +4247,7 @@ static void llm_load_vocab(
4183
4247
  vocab.linefeed_id = -1;
4184
4248
 
4185
4249
  return;
4186
- } else if (tokenizer_name == "llama") {
4250
+ } else if (tokenizer_model == "llama") {
4187
4251
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
4188
4252
 
4189
4253
  // default special tokens
@@ -4228,9 +4292,27 @@ static void llm_load_vocab(
4228
4292
  if (add_space_prefix_keyidx != -1) {
4229
4293
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4230
4294
  } // The default value of add_space_prefix is true.
4231
- } else if (tokenizer_name == "gpt2") {
4232
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
4295
+ } else if (tokenizer_model == "bert") {
4296
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;
4233
4297
 
4298
+ // default special tokens
4299
+ vocab.special_bos_id = -1;
4300
+ vocab.special_eos_id = -1;
4301
+ vocab.special_unk_id = 100;
4302
+ vocab.special_sep_id = 102;
4303
+ vocab.special_pad_id = 0;
4304
+ vocab.special_cls_id = 101;
4305
+ vocab.special_mask_id = 103;
4306
+ vocab.add_space_prefix = false;
4307
+ } else {
4308
+ if (tokenizer_model == "gpt2") {
4309
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
4310
+ } else {
4311
+ LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
4312
+ LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4313
+ vocab.type = LLAMA_VOCAB_TYPE_SPM;
4314
+ return;
4315
+ }
4234
4316
  // read bpe merges and populate bpe ranks
4235
4317
  const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
4236
4318
  if (merges_keyidx == -1) {
@@ -4264,23 +4346,65 @@ static void llm_load_vocab(
4264
4346
  vocab.special_pad_id = -1;
4265
4347
  vocab.special_cls_id = -1;
4266
4348
  vocab.special_mask_id = -1;
4267
- } else if (tokenizer_name == "bert") {
4268
- vocab.type = LLAMA_VOCAB_TYPE_WPM;
4349
+ }
4269
4350
 
4270
- // default special tokens
4271
- vocab.special_bos_id = -1;
4272
- vocab.special_eos_id = -1;
4273
- vocab.special_unk_id = 100;
4274
- vocab.special_sep_id = 102;
4275
- vocab.special_pad_id = 0;
4276
- vocab.special_cls_id = 101;
4277
- vocab.special_mask_id = 103;
4278
- vocab.add_space_prefix = false;
4351
+ // for now, only BPE models have pre-tokenizers
4352
+ if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
4353
+ if (tokenizer_pre.empty()) {
4354
+ LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
4355
+ LLAMA_LOG_WARN("%s: \n", __func__);
4356
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4357
+ LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
4358
+ LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
4359
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4360
+ LLAMA_LOG_WARN("%s: \n", __func__);
4361
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4362
+ } else if (
4363
+ tokenizer_pre == "default") {
4364
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4365
+ } else if (
4366
+ tokenizer_pre == "llama3" ||
4367
+ tokenizer_pre == "llama-v3" ||
4368
+ tokenizer_pre == "llama-bpe") {
4369
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
4370
+ } else if (
4371
+ tokenizer_pre == "deepseek-llm") {
4372
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
4373
+ } else if (
4374
+ tokenizer_pre == "deepseek-coder") {
4375
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
4376
+ } else if (
4377
+ tokenizer_pre == "falcon") {
4378
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
4379
+ } else if (
4380
+ tokenizer_pre == "mpt") {
4381
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
4382
+ } else if (
4383
+ tokenizer_pre == "starcoder") {
4384
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
4385
+ } else if (
4386
+ tokenizer_pre == "gpt-2") {
4387
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4388
+ } else if (
4389
+ tokenizer_pre == "refact") {
4390
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
4391
+ } else if (
4392
+ tokenizer_pre == "command-r") {
4393
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
4394
+ } else if (
4395
+ tokenizer_pre == "qwen2") {
4396
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
4397
+ } else if (
4398
+ tokenizer_pre == "olmo") {
4399
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
4400
+ } else if (
4401
+ tokenizer_pre == "dbrx") {
4402
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
4403
+ } else {
4404
+ throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4405
+ }
4279
4406
  } else {
4280
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
4281
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4282
-
4283
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
4407
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4284
4408
  }
4285
4409
  }
4286
4410
 
@@ -5975,7 +6099,7 @@ static bool llm_load_tensors(
5975
6099
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
5976
6100
  static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
5977
6101
  try {
5978
- llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
6102
+ llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
5979
6103
 
5980
6104
  model.hparams.vocab_only = params.vocab_only;
5981
6105
 
@@ -6013,6 +6137,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
6013
6137
  || !(
6014
6138
  model.ftype == LLAMA_FTYPE_ALL_F32 ||
6015
6139
  model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
6140
+ model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
6016
6141
  model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
6017
6142
  model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
6018
6143
  )
@@ -6104,37 +6229,47 @@ static struct ggml_tensor * llm_build_inp_embd(
6104
6229
  static void llm_build_kv_store(
6105
6230
  struct ggml_context * ctx,
6106
6231
  const llama_hparams & hparams,
6232
+ const llama_cparams & cparams,
6107
6233
  const llama_kv_cache & kv,
6108
6234
  struct ggml_cgraph * graph,
6109
6235
  struct ggml_tensor * k_cur,
6110
6236
  struct ggml_tensor * v_cur,
6111
- int64_t n_ctx,
6112
6237
  int32_t n_tokens,
6113
6238
  int32_t kv_head,
6114
6239
  const llm_build_cb & cb,
6115
6240
  int64_t il) {
6241
+ const int64_t n_ctx = cparams.n_ctx;
6242
+
6116
6243
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
6117
6244
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
6118
6245
 
6119
6246
  GGML_ASSERT(kv.size == n_ctx);
6120
6247
 
6121
- // compute the transposed [n_tokens, n_embd] V matrix
6122
- assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
6123
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
6124
- cb(v_cur_t, "v_cur_t", il);
6125
-
6126
6248
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
6127
6249
  (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
6128
6250
  cb(k_cache_view, "k_cache_view", il);
6129
6251
 
6130
- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
6131
- ( n_ctx)*ggml_element_size(kv.v_l[il]),
6132
- (kv_head)*ggml_element_size(kv.v_l[il]));
6252
+ // note: storing RoPE-ed version of K in the KV cache
6253
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
6254
+
6255
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
6256
+
6257
+ struct ggml_tensor * v_cache_view = nullptr;
6258
+
6259
+ if (cparams.flash_attn) {
6260
+ v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
6261
+ (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
6262
+ } else {
6263
+ // note: the V cache is transposed when not using flash attention
6264
+ v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
6265
+ ( n_ctx)*ggml_element_size(kv.v_l[il]),
6266
+ (kv_head)*ggml_element_size(kv.v_l[il]));
6267
+
6268
+ v_cur = ggml_transpose(ctx, v_cur);
6269
+ }
6133
6270
  cb(v_cache_view, "v_cache_view", il);
6134
6271
 
6135
- // important: storing RoPE-ed version of K in the KV cache!
6136
- ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
6137
- ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
6272
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
6138
6273
  }
6139
6274
 
6140
6275
  static struct ggml_tensor * llm_build_norm(
@@ -6354,11 +6489,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
6354
6489
  return moe_out;
6355
6490
  }
6356
6491
 
6357
- // if max_alibi_bias > 0 then apply ALiBi
6358
6492
  static struct ggml_tensor * llm_build_kqv(
6359
6493
  struct ggml_context * ctx,
6360
6494
  const llama_model & model,
6361
6495
  const llama_hparams & hparams,
6496
+ const llama_cparams & cparams,
6362
6497
  const llama_kv_cache & kv,
6363
6498
  struct ggml_cgraph * graph,
6364
6499
  struct ggml_tensor * wo,
@@ -6366,12 +6501,12 @@ static struct ggml_tensor * llm_build_kqv(
6366
6501
  struct ggml_tensor * q_cur,
6367
6502
  struct ggml_tensor * kq_mask,
6368
6503
  struct ggml_tensor * kq_pos,
6369
- int64_t n_ctx,
6370
6504
  int32_t n_tokens,
6371
6505
  int32_t n_kv,
6372
6506
  float kq_scale,
6373
6507
  const llm_build_cb & cb,
6374
6508
  int il) {
6509
+ const int64_t n_ctx = cparams.n_ctx;
6375
6510
  const int64_t n_head = hparams.n_head;
6376
6511
  const int64_t n_head_kv = hparams.n_head_kv;
6377
6512
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -6389,71 +6524,99 @@ static struct ggml_tensor * llm_build_kqv(
6389
6524
  0);
6390
6525
  cb(k, "k", il);
6391
6526
 
6392
- struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6393
- cb(kq, "kq", il);
6527
+ struct ggml_tensor * cur;
6528
+
6529
+ if (cparams.flash_attn) {
6530
+ GGML_UNUSED(model);
6531
+ GGML_UNUSED(n_ctx);
6394
6532
 
6395
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6396
- // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6397
- // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6398
- ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6399
- }
6533
+ // note: if this assert triggers, then some check has failed earlier
6534
+ // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
6535
+ GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
6400
6536
 
6401
- if (model.arch == LLM_ARCH_GROK) {
6402
- // need to do the following:
6403
- // multiply by attn_output_multiplyer of 0.08838834764831845
6404
- // and then :
6405
- // kq = 30 * tanh(kq / 30)
6406
- // before the softmax below
6537
+ // split cached v into n_head heads (not transposed)
6538
+ struct ggml_tensor * v =
6539
+ ggml_view_3d(ctx, kv.v_l[il],
6540
+ n_embd_head_v, n_kv, n_head_kv,
6541
+ ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
6542
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
6543
+ 0);
6544
+ cb(v, "v", il);
6407
6545
 
6408
- //try from phi2
6409
- //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6546
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
6410
6547
 
6411
- kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
6412
- kq = ggml_scale(ctx, kq, 30);
6413
- }
6548
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6549
+ ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6550
+ }
6551
+
6552
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
6553
+ } else {
6554
+ struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6555
+ cb(kq, "kq", il);
6556
+
6557
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6558
+ // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6559
+ // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6560
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6561
+ }
6562
+
6563
+ if (model.arch == LLM_ARCH_GROK) {
6564
+ // need to do the following:
6565
+ // multiply by attn_output_multiplyer of 0.08838834764831845
6566
+ // and then :
6567
+ // kq = 30 * tanh(kq / 30)
6568
+ // before the softmax below
6569
+
6570
+ //try from phi2
6571
+ //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6572
+
6573
+ kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
6574
+ kq = ggml_scale(ctx, kq, 30);
6575
+ }
6414
6576
 
6415
6577
  #if defined(GGML_USE_KOMPUTE)
6416
6578
  #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
6417
6579
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
6418
6580
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
6419
- if (hparams.f_max_alibi_bias > 0.0f) {
6420
- kq = ggml_scale(ctx, kq, kq_scale);
6421
- cb(kq, "kq_scaled", il);
6581
+ if (hparams.use_alibi) {
6582
+ kq = ggml_scale(ctx, kq, kq_scale);
6583
+ cb(kq, "kq_scaled", il);
6422
6584
 
6423
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6424
- cb(kq, "kq_scaled_alibi", il);
6585
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6586
+ cb(kq, "kq_scaled_alibi", il);
6425
6587
 
6426
- kq = ggml_add(ctx, kq, kq_mask);
6427
- cb(kq, "kq_masked", il);
6588
+ kq = ggml_add(ctx, kq, kq_mask);
6589
+ cb(kq, "kq_masked", il);
6428
6590
 
6429
- kq = ggml_soft_max(ctx, kq);
6430
- cb(kq, "kq_soft_max", il);
6431
- } else
6591
+ kq = ggml_soft_max(ctx, kq);
6592
+ cb(kq, "kq_soft_max", il);
6593
+ } else
6432
6594
  #endif
6433
- {
6434
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6435
- cb(kq, "kq_soft_max_ext", il);
6436
- }
6595
+ {
6596
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6597
+ cb(kq, "kq_soft_max_ext", il);
6598
+ }
6437
6599
 
6438
- GGML_ASSERT(kv.size == n_ctx);
6600
+ GGML_ASSERT(kv.size == n_ctx);
6439
6601
 
6440
- // split cached v into n_head heads
6441
- struct ggml_tensor * v =
6442
- ggml_view_3d(ctx, kv.v_l[il],
6443
- n_kv, n_embd_head_v, n_head_kv,
6444
- ggml_element_size(kv.v_l[il])*n_ctx,
6445
- ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
6446
- 0);
6447
- cb(v, "v", il);
6602
+ // split cached v into n_head heads
6603
+ struct ggml_tensor * v =
6604
+ ggml_view_3d(ctx, kv.v_l[il],
6605
+ n_kv, n_embd_head_v, n_head_kv,
6606
+ ggml_element_size(kv.v_l[il])*n_ctx,
6607
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
6608
+ 0);
6609
+ cb(v, "v", il);
6448
6610
 
6449
- struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
6450
- cb(kqv, "kqv", il);
6611
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
6612
+ cb(kqv, "kqv", il);
6451
6613
 
6452
- struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6453
- cb(kqv_merged, "kqv_merged", il);
6614
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6615
+ cb(kqv_merged, "kqv_merged", il);
6454
6616
 
6455
- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6456
- cb(cur, "kqv_merged_cont", il);
6617
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6618
+ cb(cur, "kqv_merged_cont", il);
6619
+ }
6457
6620
 
6458
6621
  ggml_build_forward_expand(graph, cur);
6459
6622
 
@@ -6473,6 +6636,7 @@ static struct ggml_tensor * llm_build_kv(
6473
6636
  struct ggml_context * ctx,
6474
6637
  const llama_model & model,
6475
6638
  const llama_hparams & hparams,
6639
+ const llama_cparams & cparams,
6476
6640
  const llama_kv_cache & kv,
6477
6641
  struct ggml_cgraph * graph,
6478
6642
  struct ggml_tensor * wo,
@@ -6482,7 +6646,6 @@ static struct ggml_tensor * llm_build_kv(
6482
6646
  struct ggml_tensor * q_cur,
6483
6647
  struct ggml_tensor * kq_mask,
6484
6648
  struct ggml_tensor * kq_pos,
6485
- int64_t n_ctx,
6486
6649
  int32_t n_tokens,
6487
6650
  int32_t kv_head,
6488
6651
  int32_t n_kv,
@@ -6496,12 +6659,12 @@ static struct ggml_tensor * llm_build_kv(
6496
6659
  ggml_build_forward_expand(graph, k_cur);
6497
6660
  ggml_build_forward_expand(graph, v_cur);
6498
6661
 
6499
- llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
6662
+ llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
6500
6663
 
6501
6664
  struct ggml_tensor * cur;
6502
6665
 
6503
- cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
6504
- q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
6666
+ cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
6667
+ q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
6505
6668
  cb(cur, "kqv_out", il);
6506
6669
 
6507
6670
  return cur;
@@ -6543,6 +6706,8 @@ struct llm_build_context {
6543
6706
  const int32_t kv_head; // index of where we store new KV data in the cache
6544
6707
  const int32_t n_orig_ctx;
6545
6708
 
6709
+ const bool flash_attn;
6710
+
6546
6711
  const enum llama_pooling_type pooling_type;
6547
6712
  const enum llama_rope_type rope_type;
6548
6713
 
@@ -6589,6 +6754,7 @@ struct llm_build_context {
6589
6754
  n_outputs (worst_case ? n_tokens : lctx.n_outputs),
6590
6755
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
6591
6756
  n_orig_ctx (cparams.n_yarn_orig_ctx),
6757
+ flash_attn (cparams.flash_attn),
6592
6758
  pooling_type (cparams.pooling_type),
6593
6759
  rope_type (hparams.rope_type),
6594
6760
  cb (cb),
@@ -6703,15 +6869,31 @@ struct llm_build_context {
6703
6869
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
6704
6870
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
6705
6871
 
6706
- ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6707
- nm, n_embd_v_gqa,
6708
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6709
- ggml_row_size(kv_self.v_l[il]->type, i));
6872
+ ggml_tensor * view_v_src;
6873
+ ggml_tensor * view_v_dst;
6874
+
6875
+ if (flash_attn) {
6876
+ // NOTE: the V cache is not transposed when using flash attention
6877
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6878
+ n_embd_v_gqa, nm,
6879
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
6880
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
6710
6881
 
6711
- ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6712
- nm, n_embd_v_gqa,
6713
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6714
- ggml_row_size(kv_self.v_l[il]->type, id));
6882
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6883
+ n_embd_v_gqa, nm,
6884
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
6885
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
6886
+ } else {
6887
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6888
+ nm, n_embd_v_gqa,
6889
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6890
+ ggml_row_size(kv_self.v_l[il]->type, i));
6891
+
6892
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6893
+ nm, n_embd_v_gqa,
6894
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6895
+ ggml_row_size(kv_self.v_l[il]->type, id));
6896
+ }
6715
6897
 
6716
6898
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
6717
6899
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
@@ -6741,20 +6923,26 @@ struct llm_build_context {
6741
6923
 
6742
6924
  struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
6743
6925
  if (causal) {
6744
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
6926
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
6745
6927
  } else {
6746
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
6928
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
6747
6929
  }
6748
6930
  cb(lctx.inp_KQ_mask, "KQ_mask", -1);
6749
6931
  ggml_set_input(lctx.inp_KQ_mask);
6750
- return lctx.inp_KQ_mask;
6932
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
6751
6933
  }
6752
6934
 
6753
- struct ggml_tensor * build_inp_KQ_pos() {
6754
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6935
+ struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
6936
+ if (causal) {
6937
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6938
+ } else {
6939
+ // TODO: this will be needed for ALiBi-based BERT models
6940
+ // https://github.com/ggerganov/llama.cpp/pull/6826
6941
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
6942
+ }
6755
6943
  cb(lctx.inp_KQ_pos, "KQ_pos", -1);
6756
6944
  ggml_set_input(lctx.inp_KQ_pos);
6757
- return lctx.inp_KQ_pos;
6945
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
6758
6946
  }
6759
6947
 
6760
6948
  struct ggml_tensor * build_inp_mean() {
@@ -6860,9 +7048,9 @@ struct llm_build_context {
6860
7048
  );
6861
7049
  cb(Kcur, "Kcur", il);
6862
7050
 
6863
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7051
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
6864
7052
  model.layers[il].wo, model.layers[il].bo,
6865
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7053
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6866
7054
  }
6867
7055
 
6868
7056
  if (il == n_layer - 1) {
@@ -7000,9 +7188,9 @@ struct llm_build_context {
7000
7188
  cb(Qcur, "Qcur", il);
7001
7189
  cb(Kcur, "Kcur", il);
7002
7190
 
7003
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7191
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7004
7192
  model.layers[il].wo, NULL,
7005
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7193
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7006
7194
  }
7007
7195
 
7008
7196
  if (il == n_layer - 1) {
@@ -7107,9 +7295,9 @@ struct llm_build_context {
7107
7295
  ext_factor, attn_factor, beta_fast, beta_slow
7108
7296
  );
7109
7297
  cb(Kcur, "Kcur", il);
7110
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7298
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7111
7299
  model.layers[il].wo, NULL,
7112
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7300
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7113
7301
  }
7114
7302
 
7115
7303
  if (il == n_layer - 1) {
@@ -7227,9 +7415,9 @@ struct llm_build_context {
7227
7415
  );
7228
7416
  cb(Kcur, "Kcur", il);
7229
7417
 
7230
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7418
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7231
7419
  model.layers[il].wo, NULL,
7232
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7420
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7233
7421
  }
7234
7422
 
7235
7423
  if (il == n_layer - 1) {
@@ -7352,9 +7540,9 @@ struct llm_build_context {
7352
7540
  );
7353
7541
  cb(Kcur, "Kcur", il);
7354
7542
 
7355
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7543
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7356
7544
  model.layers[il].wo, model.layers[il].bo,
7357
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7545
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7358
7546
  }
7359
7547
 
7360
7548
  if (il == n_layer - 1) {
@@ -7504,9 +7692,9 @@ struct llm_build_context {
7504
7692
  );
7505
7693
  cb(Kcur, "Kcur", il);
7506
7694
 
7507
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7508
- model.layers[il].wo, NULL,
7509
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7695
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7696
+ model.layers[il].wo, NULL,
7697
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7510
7698
  }
7511
7699
 
7512
7700
  if (il == n_layer - 1) {
@@ -7616,9 +7804,9 @@ struct llm_build_context {
7616
7804
 
7617
7805
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7618
7806
 
7619
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7807
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7620
7808
  model.layers[il].wo, model.layers[il].bo,
7621
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7809
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7622
7810
  }
7623
7811
 
7624
7812
  if (il == n_layer - 1) {
@@ -7820,9 +8008,9 @@ struct llm_build_context {
7820
8008
  );
7821
8009
  cb(Vcur, "Vcur", il);
7822
8010
 
7823
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8011
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7824
8012
  model.layers[il].wo, model.layers[il].bo,
7825
- Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8013
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7826
8014
  }
7827
8015
 
7828
8016
  if (il == n_layer - 1) {
@@ -7916,9 +8104,9 @@ struct llm_build_context {
7916
8104
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7917
8105
  cb(Qcur, "Qcur", il);
7918
8106
 
7919
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8107
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7920
8108
  model.layers[il].wo, NULL,
7921
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8109
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7922
8110
  }
7923
8111
 
7924
8112
  if (il == n_layer - 1) {
@@ -8209,9 +8397,9 @@ struct llm_build_context {
8209
8397
 
8210
8398
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8211
8399
 
8212
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8400
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8213
8401
  model.layers[il].wo, model.layers[il].bo,
8214
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8402
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8215
8403
  }
8216
8404
 
8217
8405
  if (il == n_layer - 1) {
@@ -8340,14 +8528,15 @@ struct llm_build_context {
8340
8528
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8341
8529
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8342
8530
 
8343
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8344
- model.layers[il].wo, model.layers[il].bo,
8345
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8531
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8532
+ model.layers[il].wo, model.layers[il].bo,
8533
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8346
8534
  } else {
8347
8535
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8348
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8536
+
8537
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8349
8538
  model.layers[il].wo, model.layers[il].bo,
8350
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8539
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8351
8540
  }
8352
8541
  }
8353
8542
 
@@ -8489,9 +8678,9 @@ struct llm_build_context {
8489
8678
  );
8490
8679
  cb(Kcur, "Kcur", il);
8491
8680
 
8492
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8681
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8493
8682
  model.layers[il].wo, NULL,
8494
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8683
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8495
8684
  }
8496
8685
 
8497
8686
  if (il == n_layer - 1) {
@@ -8607,9 +8796,9 @@ struct llm_build_context {
8607
8796
  );
8608
8797
  cb(Kcur, "Kcur", il);
8609
8798
 
8610
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8799
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8611
8800
  model.layers[il].wo, NULL,
8612
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8801
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8613
8802
  }
8614
8803
 
8615
8804
  if (il == n_layer - 1) {
@@ -8720,9 +8909,9 @@ struct llm_build_context {
8720
8909
  );
8721
8910
  cb(Kcur, "Kcur", il);
8722
8911
 
8723
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8912
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8724
8913
  model.layers[il].wo, model.layers[il].bo,
8725
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8914
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8726
8915
  }
8727
8916
 
8728
8917
  if (il == n_layer - 1) {
@@ -8834,9 +9023,9 @@ struct llm_build_context {
8834
9023
  );
8835
9024
  cb(Kcur, "Kcur", il);
8836
9025
 
8837
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9026
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8838
9027
  model.layers[il].wo, model.layers[il].bo,
8839
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9028
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8840
9029
  }
8841
9030
 
8842
9031
  if (il == n_layer - 1) {
@@ -8989,9 +9178,9 @@ struct llm_build_context {
8989
9178
  );
8990
9179
  cb(Kcur, "Kcur", il);
8991
9180
 
8992
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9181
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8993
9182
  model.layers[il].wo, model.layers[il].bo,
8994
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9183
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8995
9184
  }
8996
9185
 
8997
9186
  if (il == n_layer - 1) {
@@ -9106,9 +9295,9 @@ struct llm_build_context {
9106
9295
  );
9107
9296
  cb(Kcur, "Kcur", il);
9108
9297
 
9109
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9110
- model.layers[il].wo, NULL,
9111
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9298
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9299
+ model.layers[il].wo, model.layers[il].bo,
9300
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9112
9301
  }
9113
9302
 
9114
9303
  if (il == n_layer - 1) {
@@ -9219,9 +9408,9 @@ struct llm_build_context {
9219
9408
  ext_factor, attn_factor, beta_fast, beta_slow);
9220
9409
  cb(Kcur, "Kcur", il);
9221
9410
 
9222
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9411
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9223
9412
  model.layers[il].wo, NULL,
9224
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9413
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9225
9414
  }
9226
9415
  struct ggml_tensor * sa_out = cur;
9227
9416
 
@@ -9322,9 +9511,9 @@ struct llm_build_context {
9322
9511
 
9323
9512
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9324
9513
 
9325
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9514
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9326
9515
  model.layers[il].wo, model.layers[il].bo,
9327
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9516
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9328
9517
  }
9329
9518
 
9330
9519
  if (il == n_layer - 1) {
@@ -9429,9 +9618,9 @@ struct llm_build_context {
9429
9618
  );
9430
9619
  cb(Kcur, "Kcur", il);
9431
9620
 
9432
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9621
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9433
9622
  model.layers[il].wo, model.layers[il].bo,
9434
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9623
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9435
9624
  }
9436
9625
 
9437
9626
  if (il == n_layer - 1) {
@@ -9545,9 +9734,9 @@ struct llm_build_context {
9545
9734
  );
9546
9735
  cb(Kcur, "Kcur", il);
9547
9736
 
9548
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9737
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9549
9738
  model.layers[il].wo, NULL,
9550
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9739
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9551
9740
  }
9552
9741
 
9553
9742
  if (il == n_layer - 1) {
@@ -9662,9 +9851,9 @@ struct llm_build_context {
9662
9851
  );
9663
9852
  cb(Kcur, "Kcur", il);
9664
9853
 
9665
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9854
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9666
9855
  model.layers[il].wo, model.layers[il].bo,
9667
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9856
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9668
9857
  }
9669
9858
 
9670
9859
  if (il == n_layer - 1) {
@@ -9792,9 +9981,9 @@ struct llm_build_context {
9792
9981
  );
9793
9982
  cb(Kcur, "Kcur", il);
9794
9983
 
9795
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9984
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9796
9985
  model.layers[il].wo, model.layers[il].bo,
9797
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9986
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9798
9987
  }
9799
9988
 
9800
9989
  if (il == n_layer - 1) {
@@ -9913,9 +10102,9 @@ struct llm_build_context {
9913
10102
  ext_factor, attn_factor, beta_fast, beta_slow);
9914
10103
  cb(Kcur, "Kcur", il);
9915
10104
 
9916
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10105
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9917
10106
  model.layers[il].wo, NULL,
9918
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10107
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9919
10108
  }
9920
10109
 
9921
10110
  if (il == n_layer - 1) {
@@ -10032,9 +10221,9 @@ struct llm_build_context {
10032
10221
  );
10033
10222
  cb(Kcur, "Kcur", il);
10034
10223
 
10035
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10224
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10036
10225
  model.layers[il].wo, model.layers[il].bo,
10037
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10226
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10038
10227
  }
10039
10228
 
10040
10229
  if (il == n_layer - 1) {
@@ -10322,9 +10511,9 @@ struct llm_build_context {
10322
10511
  );
10323
10512
  cb(Kcur, "Kcur", il);
10324
10513
 
10325
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10514
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10326
10515
  model.layers[il].wo, model.layers[il].bo,
10327
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10516
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10328
10517
  }
10329
10518
 
10330
10519
  if (il == n_layer - 1) {
@@ -10453,9 +10642,9 @@ struct llm_build_context {
10453
10642
  );
10454
10643
  cb(Kcur, "Kcur", il);
10455
10644
 
10456
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10645
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10457
10646
  model.layers[il].wo, nullptr,
10458
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10647
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10459
10648
  }
10460
10649
 
10461
10650
  if (il == n_layer - 1) {
@@ -10882,7 +11071,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
10882
11071
  }
10883
11072
  }
10884
11073
 
10885
- if (hparams.need_kq_pos) {
11074
+ // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
11075
+ // this allows to process multiple sequences in parallel with ALiBi-based models
11076
+ if (hparams.use_alibi) {
10886
11077
  const int64_t n_kv = kv_self.n;
10887
11078
 
10888
11079
  GGML_ASSERT(lctx.inp_KQ_pos);
@@ -11264,7 +11455,7 @@ static int llama_decode_internal(
11264
11455
  // a heuristic, to avoid attending the full cache if it is not yet utilized
11265
11456
  // after enough generations, the benefit from this heuristic disappears
11266
11457
  // if we start defragmenting the cache, the benefit from this will be more important
11267
- kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
11458
+ kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
11268
11459
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
11269
11460
  }
11270
11461
  }
@@ -11432,6 +11623,10 @@ static int llama_decode_internal(
11432
11623
  }
11433
11624
  }
11434
11625
 
11626
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
11627
+ // overlap with device computation.
11628
+ ggml_backend_sched_reset(lctx.sched);
11629
+
11435
11630
  return 0;
11436
11631
  }
11437
11632
 
@@ -11457,7 +11652,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
11457
11652
  // each move requires 6*n_layer tensors (see build_defrag)
11458
11653
  // - source view, destination view, copy operation
11459
11654
  // - x2 for keys and values
11460
- const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
11655
+ //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
11656
+ // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
11657
+ const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
11461
11658
 
11462
11659
  // determine which KV cells to move where
11463
11660
  //
@@ -11773,7 +11970,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
11773
11970
  static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
11774
11971
  GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
11775
11972
  GGML_ASSERT(llama_is_byte_token(vocab, id));
11776
- const auto& token_data = vocab.id_to_token.at(id);
11973
+ const auto & token_data = vocab.id_to_token.at(id);
11777
11974
  switch (llama_vocab_get_type(vocab)) {
11778
11975
  case LLAMA_VOCAB_TYPE_SPM: {
11779
11976
  auto buf = token_data.text.substr(3, 2);
@@ -11781,7 +11978,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
11781
11978
  }
11782
11979
  case LLAMA_VOCAB_TYPE_BPE: {
11783
11980
  GGML_ASSERT(false);
11784
- return unicode_utf8_to_byte(token_data.text);
11981
+ return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
11785
11982
  }
11786
11983
  case LLAMA_VOCAB_TYPE_WPM: {
11787
11984
  GGML_ASSERT(false);
@@ -12003,7 +12200,94 @@ struct llm_tokenizer_bpe {
12003
12200
 
12004
12201
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12005
12202
  int final_prev_index = -1;
12006
- auto word_collection = bpe_gpt2_preprocess(text);
12203
+
12204
+ std::vector<std::string> word_collection;
12205
+ switch (vocab.type) {
12206
+ case LLAMA_VOCAB_TYPE_BPE:
12207
+ switch (vocab.type_pre) {
12208
+ case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12209
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
12210
+ word_collection = unicode_regex_split(text, {
12211
+ // original regex from tokenizer.json
12212
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12213
+
12214
+ // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
12215
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12216
+ });
12217
+ break;
12218
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12219
+ word_collection = unicode_regex_split(text, {
12220
+ "[\r\n]",
12221
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
12222
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
12223
+ "\\s+$",
12224
+ "[一-龥ࠀ-一가-퟿]+",
12225
+ "\\p{N}+",
12226
+ });
12227
+ break;
12228
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
12229
+ word_collection = unicode_regex_split(text, {
12230
+ "[\r\n]",
12231
+ "\\s?\\p{L}+",
12232
+ "\\s?\\p{P}+",
12233
+ "[一-龥ࠀ-一가-퟿]+",
12234
+ "\\p{N}",
12235
+ });
12236
+ break;
12237
+ case LLAMA_VOCAB_PRE_TYPE_FALCON:
12238
+ word_collection = unicode_regex_split(text, {
12239
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12240
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12241
+ "[0-9][0-9][0-9]",
12242
+ });
12243
+ break;
12244
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
12245
+ // TODO: MPT pre-tokenization regexes are unknown
12246
+ // the regexes below are close, but not exact - to check them, run:
12247
+ // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
12248
+ GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
12249
+ word_collection = unicode_regex_split(text, {
12250
+ "\\s?\\p{L}+",
12251
+ "\\s?\\p{P}+",
12252
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12253
+ });
12254
+ break;
12255
+ case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12256
+ case LLAMA_VOCAB_PRE_TYPE_REFACT:
12257
+ case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
12258
+ word_collection = unicode_regex_split(text, {
12259
+ "\\p{N}",
12260
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12261
+ });
12262
+ break;
12263
+ case LLAMA_VOCAB_PRE_TYPE_GPT2:
12264
+ case LLAMA_VOCAB_PRE_TYPE_OLMO:
12265
+ word_collection = unicode_regex_split(text, {
12266
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12267
+ });
12268
+ break;
12269
+ case LLAMA_VOCAB_PRE_TYPE_QWEN2:
12270
+ word_collection = unicode_regex_split(text, {
12271
+ // original regex from tokenizer.json
12272
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
12273
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12274
+ });
12275
+ break;
12276
+ default:
12277
+ // default regex for BPE tokenization pre-processing
12278
+ word_collection = unicode_regex_split(text, {
12279
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12280
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12281
+ "\\p{N}+",
12282
+ "[0-9][0-9][0-9]",
12283
+ });
12284
+ break;
12285
+ }
12286
+ break;
12287
+ default:
12288
+ GGML_ASSERT(false);
12289
+ break;
12290
+ }
12007
12291
 
12008
12292
  symbols_final.clear();
12009
12293
 
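
The switch above selects a set of pre-tokenizer regexes per vocab.type_pre and passes them to unicode_regex_split. As a rough, self-contained illustration of that splitting step (not llama.cpp's implementation): std::regex has no \p{...} Unicode classes, so this sketch uses an ASCII-only approximation of the GPT-2 pattern:

#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// ASCII-only approximation of the GPT-2 pre-tokenizer split:
// contractions | ?letters | ?digits | ?other | whitespace
static std::vector<std::string> pre_tokenize(const std::string & text) {
    static const std::regex re(
        "'s|'t|'re|'ve|'m|'ll|'d| ?[A-Za-z]+| ?[0-9]+| ?[^ A-Za-z0-9]+|\\s+");
    std::vector<std::string> words;
    for (std::sregex_iterator it(text.begin(), text.end(), re), end; it != end; ++it) {
        words.push_back(it->str());
    }
    return words;
}

int main() {
    // prints: [I] ['ll] [ buy] [ 3] [ apples] [,] [ thanks] [!]
    for (const auto & w : pre_tokenize("I'll buy 3 apples, thanks!")) {
        std::printf("[%s] ", w.c_str());
    }
    std::printf("\n");
    return 0;
}
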
@@ -12130,145 +12414,6 @@ private:
12130
12414
  work_queue.push(bigram);
12131
12415
  }
12132
12416
 
12133
- std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
12134
- std::vector<std::string> bpe_words;
12135
- std::vector<std::string> bpe_encoded_words;
12136
-
12137
- std::string token = "";
12138
- // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
12139
- bool collecting_numeric = false;
12140
- bool collecting_letter = false;
12141
- bool collecting_special = false;
12142
- bool collecting_whitespace_lookahead = false;
12143
- bool collecting = false;
12144
-
12145
- std::vector<std::string> text_utf;
12146
- text_utf.reserve(text.size());
12147
- bpe_words.reserve(text.size());
12148
- bpe_encoded_words.reserve(text.size());
12149
-
12150
- const auto cpts = unicode_cpts_from_utf8(text);
12151
- for (size_t i = 0; i < cpts.size(); ++i)
12152
- text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
12153
-
12154
- for (int i = 0; i < (int)text_utf.size(); i++) {
12155
- const std::string & utf_char = text_utf[i];
12156
- bool split_condition = false;
12157
- int bytes_remain = text_utf.size() - i;
12158
- // forward backward lookups
12159
- const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
12160
- const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
12161
-
12162
- // handling contractions
12163
- if (!split_condition && bytes_remain >= 2) {
12164
- // 's|'t|'m|'d
12165
- if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
12166
- split_condition = true;
12167
- }
12168
- if (split_condition) {
12169
- if (token.size()) {
12170
- bpe_words.emplace_back(token); // push previous content as token
12171
- }
12172
- token = utf_char + utf_char_next;
12173
- bpe_words.emplace_back(token);
12174
- token = "";
12175
- i++;
12176
- continue;
12177
- }
12178
- }
12179
- if (!split_condition && bytes_remain >= 3) {
12180
- // 're|'ve|'ll
12181
- if (utf_char == "\'" && (
12182
- (utf_char_next == "r" && utf_char_next_next == "e") ||
12183
- (utf_char_next == "v" && utf_char_next_next == "e") ||
12184
- (utf_char_next == "l" && utf_char_next_next == "l"))
12185
- ) {
12186
- split_condition = true;
12187
- }
12188
- if (split_condition) {
12189
- // current token + next token can be defined
12190
- if (token.size()) {
12191
- bpe_words.emplace_back(token); // push previous content as token
12192
- }
12193
- token = utf_char + utf_char_next + utf_char_next_next;
12194
- bpe_words.emplace_back(token); // the contraction
12195
- token = "";
12196
- i += 2;
12197
- continue;
12198
- }
12199
- }
12200
-
12201
- if (!split_condition && !collecting) {
12202
- if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
12203
- collecting_letter = true;
12204
- collecting = true;
12205
- }
12206
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
12207
- collecting_numeric = true;
12208
- collecting = true;
12209
- }
12210
- else if (
12211
- ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
12212
- (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
12213
- ) {
12214
- collecting_special = true;
12215
- collecting = true;
12216
- }
12217
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
12218
- collecting_whitespace_lookahead = true;
12219
- collecting = true;
12220
- }
12221
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
12222
- split_condition = true;
12223
- }
12224
- }
12225
- else if (!split_condition && collecting) {
12226
- if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
12227
- split_condition = true;
12228
- }
12229
- else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
12230
- split_condition = true;
12231
- }
12232
- else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
12233
- split_condition = true;
12234
- }
12235
- else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
12236
- split_condition = true;
12237
- }
12238
- }
12239
-
12240
- if (utf_char_next == "") {
12241
- split_condition = true; // final
12242
- token += utf_char;
12243
- }
12244
-
12245
- if (split_condition) {
12246
- if (token.size()) {
12247
- bpe_words.emplace_back(token);
12248
- }
12249
- token = utf_char;
12250
- collecting = false;
12251
- collecting_letter = false;
12252
- collecting_numeric = false;
12253
- collecting_special = false;
12254
- collecting_whitespace_lookahead = false;
12255
- }
12256
- else {
12257
- token += utf_char;
12258
- }
12259
- }
12260
-
12261
- for (std::string & word : bpe_words) {
12262
- std::string encoded_token = "";
12263
- for (char & c : word) {
12264
- encoded_token += unicode_byte_to_utf8(c);
12265
- }
12266
- bpe_encoded_words.emplace_back(encoded_token);
12267
- }
12268
-
12269
- return bpe_encoded_words;
12270
- }
12271
-
12272
12417
  const llama_vocab & vocab;
12273
12418
 
12274
12419
  std::vector<llm_symbol> symbols;
@@ -12343,7 +12488,7 @@ struct llm_tokenizer_wpm {
12343
12488
  continue;
12344
12489
  }
12345
12490
  code = unicode_tolower(code);
12346
- if (type == CODEPOINT_TYPE_WHITESPACE) {
12491
+ if (type == CODEPOINT_TYPE_SEPARATOR) {
12347
12492
  code = ' ';
12348
12493
  }
12349
12494
  std::string s = unicode_cpt_to_utf8(code);
@@ -12588,7 +12733,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12588
12733
  } break;
12589
12734
  case LLAMA_VOCAB_TYPE_BPE:
12590
12735
  {
12591
- if (add_special && vocab.special_add_bos == 1) {
12736
+ if (add_special && vocab.special_add_bos != 0) {
12592
12737
  GGML_ASSERT(vocab.special_bos_id != -1);
12593
12738
  output.push_back(vocab.special_bos_id);
12594
12739
  }
@@ -14030,13 +14175,16 @@ static void llama_tensor_dequantize_internal(
14030
14175
  if (qtype.to_float == NULL) {
14031
14176
  throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
14032
14177
  }
14033
- } else if (tensor->type != GGML_TYPE_F16) {
14178
+ } else if (tensor->type != GGML_TYPE_F16 &&
14179
+ tensor->type != GGML_TYPE_BF16) {
14034
14180
  throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
14035
14181
  }
14036
14182
 
14037
14183
  if (nthread < 2) {
14038
14184
  if (tensor->type == GGML_TYPE_F16) {
14039
14185
  ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
14186
+ } else if (tensor->type == GGML_TYPE_BF16) {
14187
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
14040
14188
  } else if (ggml_is_quantized(tensor->type)) {
14041
14189
  qtype.to_float(tensor->data, f32_output, nelements);
14042
14190
  } else {
@@ -14045,7 +14193,14 @@ static void llama_tensor_dequantize_internal(
14045
14193
  return;
14046
14194
  }
14047
14195
 
14048
- size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
14196
+ size_t block_size;
14197
+ if (tensor->type == GGML_TYPE_F16 ||
14198
+ tensor->type == GGML_TYPE_BF16) {
14199
+ block_size = 1;
14200
+ } else {
14201
+ block_size = (size_t)ggml_blck_size(tensor->type);
14202
+ }
14203
+
14049
14204
  size_t block_size_bytes = ggml_type_size(tensor->type);
14050
14205
 
14051
14206
  GGML_ASSERT(nelements % block_size == 0);
@@ -14064,6 +14219,8 @@ static void llama_tensor_dequantize_internal(
14064
14219
  auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
14065
14220
  if (typ == GGML_TYPE_F16) {
14066
14221
  ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
14222
+ } else if (typ == GGML_TYPE_BF16) {
14223
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
14067
14224
  } else {
14068
14225
  qtype.to_float(inbuf, outbuf, nels);
14069
14226
  }
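
The hunks above extend dequantization to GGML_TYPE_BF16 via ggml_bf16_to_fp32_row (with block_size = 1, like F16). A standalone sketch of the bf16 layout that conversion relies on - bf16 is simply the upper 16 bits of the IEEE-754 float32 encoding (this is not ggml's implementation, which also applies rounding and NaN handling in the other direction):

#include <cstdint>
#include <cstdio>
#include <cstring>

// bf16 keeps the sign, the 8 exponent bits and the top 7 mantissa bits of a float32
static float bf16_to_fp32(uint16_t h) {
    const uint32_t bits = (uint32_t) h << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

static uint16_t fp32_to_bf16_trunc(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return (uint16_t) (bits >> 16); // plain truncation, for illustration only
}

int main() {
    const float x = 3.14159f;
    const uint16_t h = fp32_to_bf16_trunc(x);
    std::printf("%f -> bf16 0x%04x -> %f\n", x, (unsigned) h, bf16_to_fp32(h));
    return 0;
}
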
@@ -14360,14 +14517,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
14360
14517
  }
14361
14518
 
14362
14519
  static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
14363
- std::mutex mutex;
14364
- int64_t counter = 0;
14365
- size_t new_size = 0;
14366
14520
  if (nthread < 2) {
14367
14521
  // single-thread
14368
- return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14522
+ size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14523
+ if (!ggml_validate_row_data(new_type, new_data, new_size)) {
14524
+ throw std::runtime_error("quantized data validation failed");
14525
+ }
14526
+ return new_size;
14369
14527
  }
14370
- auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
14528
+
14529
+ std::mutex mutex;
14530
+ int64_t counter = 0;
14531
+ size_t new_size = 0;
14532
+ bool valid = true;
14533
+ auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
14371
14534
  nrows, n_per_row, imatrix]() {
14372
14535
  const int64_t nrows_per_chunk = chunk_size / n_per_row;
14373
14536
  size_t local_size = 0;
@@ -14382,7 +14545,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14382
14545
  }
14383
14546
  lock.unlock();
14384
14547
  const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
14385
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14548
+ size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14549
+ local_size += this_size;
14550
+
14551
+ // validate the quantized data
14552
+ const size_t row_size = ggml_row_size(new_type, n_per_row);
14553
+ void * this_data = (char *) new_data + first_row * row_size;
14554
+ if (!ggml_validate_row_data(new_type, this_data, this_size)) {
14555
+ std::unique_lock<std::mutex> lock(mutex);
14556
+ valid = false;
14557
+ break;
14558
+ }
14386
14559
  }
14387
14560
  };
14388
14561
  for (int it = 0; it < nthread - 1; ++it) {
@@ -14391,6 +14564,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14391
14564
  compute();
14392
14565
  for (auto & w : workers) { w.join(); }
14393
14566
  workers.clear();
14567
+ if (!valid) {
14568
+ throw std::runtime_error("quantized data validation failed");
14569
+ }
14394
14570
  return new_size;
14395
14571
  }
14396
14572
 
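
The quantization hunk above validates every chunk with ggml_validate_row_data and reports failure through a shared flag taken under the mutex, throwing only after all workers have joined. A minimal sketch of that error-propagation pattern, with a stand-in check in place of the ggml call:

#include <mutex>
#include <stdexcept>
#include <thread>
#include <vector>

int main() {
    std::mutex mutex;
    bool valid = true;

    // worker: validate the chunk it produced and flip the shared flag on failure
    auto compute = [&](int chunk) {
        const bool ok = (chunk >= 0); // stand-in for ggml_validate_row_data(); always passes here
        if (!ok) {
            std::lock_guard<std::mutex> lock(mutex);
            valid = false;
        }
    };

    std::vector<std::thread> workers;
    for (int it = 0; it < 4; ++it) {
        workers.emplace_back(compute, it);
    }
    for (auto & w : workers) {
        w.join();
    }

    // only after joining do we know whether any chunk failed validation
    if (!valid) {
        throw std::runtime_error("quantized data validation failed");
    }
    return 0;
}
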
@@ -14405,6 +14581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14405
14581
  case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
14406
14582
  case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
14407
14583
  case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
14584
+ case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
14408
14585
  case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
14409
14586
 
14410
14587
  // K-quants
@@ -14453,7 +14630,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14453
14630
  auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
14454
14631
  kv_overrides = v->data();
14455
14632
  }
14456
- llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
14633
+ llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
14457
14634
  ml.init_mappings(false); // no prefetching
14458
14635
 
14459
14636
  llama_model model;
@@ -14491,11 +14668,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14491
14668
  for (auto & o : overrides) {
14492
14669
  if (o.key[0] == 0) break;
14493
14670
  if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
14494
- gguf_set_val_f32(ctx_out, o.key, o.float_value);
14671
+ gguf_set_val_f32(ctx_out, o.key, o.val_f64);
14495
14672
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
14496
- gguf_set_val_i32(ctx_out, o.key, o.int_value);
14673
+ gguf_set_val_i32(ctx_out, o.key, o.val_i64);
14497
14674
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
14498
- gguf_set_val_bool(ctx_out, o.key, o.bool_value);
14675
+ gguf_set_val_bool(ctx_out, o.key, o.val_bool);
14676
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
14677
+ gguf_set_val_str(ctx_out, o.key, o.val_str);
14499
14678
  } else {
14500
14679
  LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
14501
14680
  }
@@ -14814,7 +14993,7 @@ static int llama_apply_lora_from_file_internal(
14814
14993
  std::unique_ptr<llama_model_loader> ml;
14815
14994
  if (path_base_model) {
14816
14995
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
14817
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
14996
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
14818
14997
  ml->init_mappings(/*prefetch*/ false); // no prefetching
14819
14998
  }
14820
14999
 
@@ -15073,6 +15252,7 @@ struct llama_model_params llama_model_default_params() {
15073
15252
  /*.vocab_only =*/ false,
15074
15253
  /*.use_mmap =*/ true,
15075
15254
  /*.use_mlock =*/ false,
15255
+ /*.check_tensors =*/ false,
15076
15256
  };
15077
15257
 
15078
15258
  #ifdef GGML_USE_METAL
@@ -15109,6 +15289,7 @@ struct llama_context_params llama_context_default_params() {
15109
15289
  /*.logits_all =*/ false,
15110
15290
  /*.embeddings =*/ false,
15111
15291
  /*.offload_kqv =*/ true,
15292
+ /*.flash_attn =*/ false,
15112
15293
  /*.abort_callback =*/ nullptr,
15113
15294
  /*.abort_callback_data =*/ nullptr,
15114
15295
  };
@@ -15275,6 +15456,7 @@ struct llama_context * llama_new_context_with_model(
15275
15456
  cparams.defrag_thold = params.defrag_thold;
15276
15457
  cparams.embeddings = params.embeddings;
15277
15458
  cparams.offload_kqv = params.offload_kqv;
15459
+ cparams.flash_attn = params.flash_attn;
15278
15460
  cparams.pooling_type = params.pooling_type;
15279
15461
 
15280
15462
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -15282,12 +15464,20 @@ struct llama_context * llama_new_context_with_model(
15282
15464
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
15283
15465
 
15284
15466
  // this is necessary due to kv_self.n being padded later during inference
15285
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
15467
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
15286
15468
 
15287
15469
  // with causal attention, the batch size is limited by the context size
15288
15470
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
15289
- cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
15290
15471
 
15472
+ // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
15473
+ // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
15474
+ // ref: https://github.com/ggerganov/llama.cpp/pull/5021
15475
+ if (cparams.n_batch < GGML_KQ_MASK_PAD) {
15476
+ LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
15477
+ cparams.n_batch = GGML_KQ_MASK_PAD;
15478
+ }
15479
+
15480
+ cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
15291
15481
 
15292
15482
  cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
15293
15483
  hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
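
The hunk above pads n_ctx to a multiple of 256 (matching the new kv_self.n padding) and clamps n_batch to GGML_KQ_MASK_PAD so the padded KQ_mask never reads past the batch. A small worked example, assuming GGML_KQ_MASK_PAD is 32 and using arbitrary requested sizes:

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t kq_mask_pad = 32;  // assumed value of GGML_KQ_MASK_PAD
    const uint32_t ctx_pad     = 256; // context padding used above

    uint32_t n_ctx   = 1000; // requested context size (assumed)
    uint32_t n_batch = 16;   // requested batch size (assumed)

    // pad the context up front, since kv_self.n gets padded to 256 during inference
    n_ctx = ((n_ctx + ctx_pad - 1) / ctx_pad) * ctx_pad; // -> 1024

    // the KQ mask is padded for the GPU kernels, so the batch must cover at least that many tokens
    if (n_batch < kq_mask_pad) {
        n_batch = kq_mask_pad; // -> 32
    }

    std::printf("n_ctx=%u n_batch=%u\n", n_ctx, n_batch);
    return 0;
}
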
@@ -15319,6 +15509,16 @@ struct llama_context * llama_new_context_with_model(
15319
15509
  }
15320
15510
  }
15321
15511
 
15512
+ if (cparams.flash_attn && hparams.use_alibi) {
15513
+ LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
15514
+ cparams.flash_attn = false;
15515
+ }
15516
+
15517
+ if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
15518
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
15519
+ cparams.flash_attn = false;
15520
+ }
15521
+
15322
15522
  if (params.seed == LLAMA_DEFAULT_SEED) {
15323
15523
  params.seed = time(NULL);
15324
15524
  }
@@ -15326,6 +15526,7 @@ struct llama_context * llama_new_context_with_model(
15326
15526
  LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
15327
15527
  LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
15328
15528
  LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
15529
+ LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
15329
15530
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
15330
15531
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
15331
15532
 
@@ -15454,7 +15655,7 @@ struct llama_context * llama_new_context_with_model(
15454
15655
  }
15455
15656
  ctx->backends.push_back(ctx->backend_cpu);
15456
15657
 
15457
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
15658
+ if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
15458
15659
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
15459
15660
  llama_free(ctx);
15460
15661
  return nullptr;
@@ -16053,6 +16254,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
16053
16254
  const size_t s_kv_head = sizeof(uint32_t);
16054
16255
  const size_t s_kv_size = sizeof(uint32_t);
16055
16256
  const size_t s_kv_used = sizeof(uint32_t);
16257
+ const size_t s_v_trans = sizeof(uint32_t);
16056
16258
  const size_t s_kv = ctx->kv_self.total_size();
16057
16259
  const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
16058
16260
  const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
@@ -16070,10 +16272,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
16070
16272
  + s_kv_head
16071
16273
  + s_kv_size
16072
16274
  + s_kv_used
16275
+ + s_v_trans
16073
16276
  + s_kv
16074
16277
  + s_kv_cells
16075
16278
  );
16076
16279
 
16280
+ // on a session format change it is very likely that the state size has changed - so we need to update this function
16281
+ static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
16282
+
16077
16283
  return s_total;
16078
16284
  }
16079
16285
 
@@ -16219,11 +16425,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
16219
16425
  const uint32_t kv_size = kv_self.size;
16220
16426
  const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
16221
16427
  const uint32_t kv_used = kv_self.used;
16428
+ const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
16222
16429
 
16223
16430
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
16224
16431
  data_ctx->write(&kv_head, sizeof(kv_head));
16225
16432
  data_ctx->write(&kv_size, sizeof(kv_size));
16226
16433
  data_ctx->write(&kv_used, sizeof(kv_used));
16434
+ data_ctx->write(&v_trans, sizeof(v_trans));
16227
16435
 
16228
16436
  if (kv_buf_size) {
16229
16437
  const size_t pre_kv_buf_size = data_ctx->get_size_written();
@@ -16236,7 +16444,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
16236
16444
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
16237
16445
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
16238
16446
 
16239
- if (kv_self.recurrent) {
16447
+ if (kv_self.recurrent || !kv_self.v_trans) {
16240
16448
  // v is contiguous for recurrent models
16241
16449
  // TODO: use other tensors for state models than k and v
16242
16450
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16369,11 +16577,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16369
16577
  uint32_t kv_head;
16370
16578
  uint32_t kv_size;
16371
16579
  uint32_t kv_used;
16580
+ uint32_t v_trans;
16372
16581
 
16373
16582
  memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
16374
16583
  memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
16375
16584
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
16376
16585
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
16586
+ memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
16587
+
16588
+ GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
16377
16589
 
16378
16590
  if (kv_self.size != kv_size) {
16379
16591
  // the KV cache needs to be big enough to load all the KV cells from the saved state
@@ -16383,6 +16595,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16383
16595
  __func__, kv_head, kv_size, kv_self.size);
16384
16596
  }
16385
16597
 
16598
+ llama_kv_cache_clear(ctx);
16599
+
16386
16600
  if (kv_buf_size) {
16387
16601
  const size_t pre_kv_buf_size = inp - src;
16388
16602
 
@@ -16394,7 +16608,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16394
16608
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
16395
16609
  inp += k_size;
16396
16610
 
16397
- if (kv_self.recurrent) {
16611
+ if (kv_self.recurrent || !kv_self.v_trans) {
16398
16612
  // v is contiguous for recurrent models
16399
16613
  // TODO: use other tensors for state models than k and v
16400
16614
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16416,8 +16630,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16416
16630
  GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
16417
16631
  }
16418
16632
 
16419
- llama_kv_cache_clear(ctx);
16420
-
16421
16633
  ctx->kv_self.head = kv_head;
16422
16634
  ctx->kv_self.used = kv_used;
16423
16635
 
@@ -16677,28 +16889,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
16677
16889
  }
16678
16890
  }
16679
16891
 
16680
- // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16681
- const uint32_t kv_size = kv_self.size;
16682
- for (int il = 0; il < (int)n_layer; ++il) {
16683
- // Write value type
16684
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16685
- data_ctx.write(&v_type_i, sizeof(v_type_i));
16892
+ // TODO: simplify, reduce copy-paste
16893
+ if (!kv_self.v_trans) {
16894
+ for (int il = 0; il < (int)n_layer; ++il) {
16895
+ // Write value type
16896
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16897
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16686
16898
 
16687
- // Write element size
16688
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16689
- data_ctx.write(&v_size_el, sizeof(v_size_el));
16899
+ // Write row size of value
16900
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
16901
+ data_ctx.write(&v_size_row, sizeof(v_size_row));
16690
16902
 
16691
- // For each row, we get the element values of each cell
16692
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16693
- // Read each range of cells of v_size_el length each into tmp_buf and write out
16903
+ // Read each range of cells of v_size length each into tmp_buf and write out
16694
16904
  for (const auto & range : cell_ranges) {
16695
16905
  const size_t range_size = range.second - range.first;
16696
- const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16697
- tmp_buf.resize(range_size * v_size_el);
16698
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16906
+ tmp_buf.resize(range_size * v_size_row);
16907
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
16699
16908
  data_ctx.write(tmp_buf.data(), tmp_buf.size());
16700
16909
  }
16701
16910
  }
16911
+ } else {
16912
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16913
+ const uint32_t kv_size = kv_self.size;
16914
+ for (int il = 0; il < (int)n_layer; ++il) {
16915
+ // Write value type
16916
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16917
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16918
+
16919
+ // Write element size
16920
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16921
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
16922
+
16923
+ // For each row, we get the element values of each cell
16924
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16925
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
16926
+ for (const auto & range : cell_ranges) {
16927
+ const size_t range_size = range.second - range.first;
16928
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16929
+ tmp_buf.resize(range_size * v_size_el);
16930
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16931
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
16932
+ }
16933
+ }
16934
+ }
16702
16935
  }
16703
16936
 
16704
16937
  return data_ctx.get_size_written();
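
The branch above writes V row-by-row when it is stored contiguously (v_trans == false) and per embedding row, cell range by cell range, when it is transposed. A toy sketch of the two offset calculations, counting float elements instead of the ggml_row_size()/ggml_type_size() byte sizes used above, with arbitrary dimensions:

#include <cstdio>

int main() {
    const int n_embd_v_gqa = 4; // values per cell (assumed)
    const int kv_size      = 8; // total cells in the cache (assumed)
    const int first        = 2; // first cell of the range being serialized
    const int count        = 3; // number of cells in the range

    // non-transposed: each cell is one contiguous row, so the range is a single block
    std::printf("contiguous: offset=%d count=%d\n", first * n_embd_v_gqa, count * n_embd_v_gqa);

    // transposed: each embedding row spans all kv_size cells,
    // so the range is read row by row at offset (first + j*kv_size)
    for (int j = 0; j < n_embd_v_gqa; ++j) {
        std::printf("transposed: row %d offset=%d count=%d\n", j, first + j * kv_size, count);
    }
    return 0;
}
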
@@ -16823,41 +17056,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
16823
17056
  }
16824
17057
  }
16825
17058
 
16826
- // For each layer, read the values for each cell (transposed)
16827
- for (int il = 0; il < (int)n_layer; ++il) {
16828
- // Read type of value
16829
- int32_t v_type_i_ref;
16830
- memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
16831
- inp += sizeof(v_type_i_ref);
16832
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16833
- if (v_type_i != v_type_i_ref) {
16834
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16835
- LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
16836
- return 0;
16837
- }
17059
+ // TODO: simplify, reduce copy-paste
17060
+ if (!kv_self.v_trans) {
17061
+ for (int il = 0; il < (int)n_layer; ++il) {
17062
+ // Read type of value
17063
+ int32_t v_type_i_ref;
17064
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
17065
+ inp += sizeof(v_type_i_ref);
17066
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
17067
+ if (v_type_i != v_type_i_ref) {
17068
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17069
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
17070
+ return 0;
17071
+ }
16838
17072
 
16839
- // Read element size of value
16840
- size_t v_size_el_ref;
16841
- memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
16842
- inp += sizeof(v_size_el_ref);
16843
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16844
- if (v_size_el != v_size_el_ref) {
16845
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16846
- LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
16847
- return 0;
16848
- }
17073
+ // Read row size of value
17074
+ size_t v_size_row_ref;
17075
+ memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
17076
+ inp += sizeof(v_size_row_ref);
17077
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
17078
+ if (v_size_row != v_size_row_ref) {
17079
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17080
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
17081
+ return 0;
17082
+ }
16849
17083
 
16850
- if (cell_count) {
16851
- // For each row in the transposed matrix, read the values for the whole cell range
16852
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16853
- const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
16854
- ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
16855
- inp += cell_count * v_size_el;
17084
+ if (cell_count) {
17085
+ // Read and set the values for the whole cell range
17086
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
17087
+ inp += cell_count * v_size_row;
17088
+ }
17089
+ }
17090
+ } else {
17091
+ // For each layer, read the values for each cell (transposed)
17092
+ for (int il = 0; il < (int)n_layer; ++il) {
17093
+ // Read type of value
17094
+ int32_t v_type_i_ref;
17095
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
17096
+ inp += sizeof(v_type_i_ref);
17097
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
17098
+ if (v_type_i != v_type_i_ref) {
17099
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17100
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
17101
+ return 0;
17102
+ }
17103
+
17104
+ // Read element size of value
17105
+ size_t v_size_el_ref;
17106
+ memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
17107
+ inp += sizeof(v_size_el_ref);
17108
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
17109
+ if (v_size_el != v_size_el_ref) {
17110
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17111
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
17112
+ return 0;
17113
+ }
17114
+
17115
+ if (cell_count) {
17116
+ // For each row in the transposed matrix, read the values for the whole cell range
17117
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
17118
+ const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
17119
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
17120
+ inp += cell_count * v_size_el;
17121
+ }
16856
17122
  }
16857
17123
  }
16858
17124
  }
16859
17125
 
16860
17126
  const size_t nread = inp - src;
17127
+
16861
17128
  return nread;
16862
17129
  }
16863
17130
 
@@ -17238,9 +17505,10 @@ int32_t llama_tokenize(
17238
17505
 
17239
17506
  static std::string llama_decode_text(const std::string & text) {
17240
17507
  std::string decoded_text;
17241
- auto unicode_sequences = unicode_cpts_from_utf8(text);
17242
- for (auto & unicode_sequence : unicode_sequences) {
17243
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
17508
+
17509
+ const auto cpts = unicode_cpts_from_utf8(text);
17510
+ for (const auto cpt : cpts) {
17511
+ decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
17244
17512
  }
17245
17513
 
17246
17514
  return decoded_text;
@@ -17604,7 +17872,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
17604
17872
  /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
17605
17873
 
17606
17874
  /*.n_sample =*/ std::max(1, ctx->n_sample),
17607
- /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
17875
+ /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
17608
17876
  /*.n_eval =*/ std::max(1, ctx->n_eval),
17609
17877
  };
17610
17878
 
@@ -17654,9 +17922,9 @@ const char * llama_print_system_info(void) {
17654
17922
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
17655
17923
  s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
17656
17924
  #ifdef GGML_USE_LLAMAFILE
17657
- s += "LAMMAFILE = 1 | ";
17925
+ s += "LLAMAFILE = 1 | ";
17658
17926
  #else
17659
- s += "LAMMAFILE = 0 | ";
17927
+ s += "LLAMAFILE = 0 | ";
17660
17928
  #endif
17661
17929
 
17662
17930
  return s.c_str();