llama_cpp 0.14.6 → 0.15.0 — changes to the vendored llama.cpp sources

@@ -75,6 +75,7 @@
  #include <forward_list>
  #include <fstream>
  #include <functional>
+ #include <future>
  #include <initializer_list>
  #include <locale>
  #include <map>
@@ -107,7 +108,6 @@
  #define LLAMA_MAX_NODES 8192
  #define LLAMA_MAX_EXPERTS 60

-
  //
  // logging
  //
@@ -211,6 +211,7 @@ enum llm_arch {
  LLM_ARCH_QWEN2,
  LLM_ARCH_QWEN2MOE,
  LLM_ARCH_PHI2,
+ LLM_ARCH_PHI3,
  LLM_ARCH_PLAMO,
  LLM_ARCH_CODESHELL,
  LLM_ARCH_ORION,
@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_QWEN2, "qwen2" },
  { LLM_ARCH_QWEN2MOE, "qwen2moe" },
  { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
  { LLM_ARCH_PLAMO, "plamo" },
  { LLM_ARCH_CODESHELL, "codeshell" },
  { LLM_ARCH_ORION, "orion" },
@@ -314,6 +316,7 @@ enum llm_kv {
  LLM_KV_SSM_TIME_STEP_RANK,

  LLM_KV_TOKENIZER_MODEL,
+ LLM_KV_TOKENIZER_PRE,
  LLM_KV_TOKENIZER_LIST,
  LLM_KV_TOKENIZER_TOKEN_TYPE,
  LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -390,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },

  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+ { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
  { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
@@ -793,6 +797,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PHI3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_PLAMO,
  {
@@ -1600,12 +1621,12 @@ struct llama_mlock {
  };
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

- static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
  GGML_ASSERT(check == -n_tokens);
  }
  else {
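Editor's note: the wrapper above reflects the new 5-argument llama_token_to_piece() C API, which now takes a trailing bool special controlling whether special/control tokens are rendered as text. A minimal caller-side sketch, assuming only the signature shown in this hunk (the two-call resize pattern mirrors the wrapper):

```cpp
// Sketch: detokenize one token with the new `special` flag.
// Assumes the 5-argument llama_token_to_piece() visible in this diff.
#include <string>
#include <vector>
#include "llama.h"

static std::string token_to_piece(const llama_model * model, llama_token token, bool special) {
    std::vector<char> buf(8, 0);
    int n = llama_token_to_piece(model, token, buf.data(), (int) buf.size(), special);
    if (n < 0) {
        buf.resize(-n); // a negative return reports the required buffer size
        n = llama_token_to_piece(model, token, buf.data(), (int) buf.size(), special);
    }
    return std::string(buf.data(), n);
}
```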
@@ -1824,7 +1845,7 @@ struct llama_hparams {
  float f_logit_scale = 0.0f;

  bool causal_attn = true;
- bool need_kq_pos = false;
+ bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models

  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1914,6 +1935,7 @@ struct llama_cparams {
  bool embeddings;
  bool causal_attn;
  bool offload_kqv;
+ bool flash_attn;

  enum llama_pooling_type pooling_type;

@@ -2017,8 +2039,8 @@ struct llama_kv_cache {
  bool has_shift = false;
  bool do_defrag = false;
  bool do_copy = false;
- // with recurrent state models, a cell can hold the state for more than one past token
- bool recurrent = false;
+ bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+ bool v_trans = true; // the value tensor is transposed

  // Note: The value of head isn't only used to optimize searching
  // for a free KV slot. llama_decode_internal also uses it, so it
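Editor's note: a `flash_attn` flag is threaded into llama_cparams and the KV cache gains a v_trans flag (V is stored transposed only when flash attention is off). A hedged sketch of enabling it from client code, assuming the public llama_context_params in this release exposes a matching flash_attn field:

```cpp
// Sketch (assumption: llama_context_params carries a `flash_attn` flag in this release).
#include "llama.h"

llama_context * make_ctx(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx      = 4096;
    cparams.flash_attn = true; // request the flash-attention graph path (assumed field name)
    return llama_new_context_with_model(model, cparams);
}
```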
@@ -2095,7 +2117,8 @@ struct llama_vocab {
  ttype type;
  };

- enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+ enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+ enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

  std::unordered_map<token, id> token_to_id;
  std::vector<token_data> id_to_token;
@@ -2120,7 +2143,7 @@ struct llama_vocab {
  id special_prefix_id = -1;
  id special_suffix_id = -1;
  id special_middle_id = -1;
- id special_eot_id = -1;
+ id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token

  bool add_space_prefix = true;

@@ -2316,11 +2339,14 @@ struct llama_context {

  static bool llama_kv_cache_init(
  struct llama_kv_cache & cache,
- const llama_model & model,
+ const llama_context * ctx,
  ggml_type type_k,
  ggml_type type_v,
  uint32_t kv_size,
  bool offload) {
+ const llama_model & model = ctx->model;
+ const llama_cparams & cparams = ctx->cparams;
+
  const struct llama_hparams & hparams = model.hparams;

  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
@@ -2331,8 +2357,9 @@ static bool llama_kv_cache_init(

  // TODO: find a nicer way to add other recurrent model architectures
  cache.recurrent = model.arch == LLM_ARCH_MAMBA;
+ cache.v_trans = !cparams.flash_attn;

- // TODO: support mixed reccurent Transformer architectues
+ // TODO: support mixed recurrent Transformer architectures
  // NOTE: (!a || b) is a logical implication (a -> b)
  GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
  GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
@@ -2543,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
  }
  cache.head = 0;
  cache.used = 0;
+
+ for (auto & buf : cache.bufs) {
+ ggml_backend_buffer_clear(buf, 0);
+ }
  }

  static bool llama_kv_cache_seq_rm(
@@ -2863,6 +2894,7 @@ namespace GGUFMeta {
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
  case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
+ case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
  }
  return "unknown";
  }
@@ -2874,13 +2906,16 @@ namespace GGUFMeta {
  __func__, override_type_to_str(ovrd->tag), ovrd->key);
  switch (ovrd->tag) {
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
- LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
+ LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
  } break;
  case LLAMA_KV_OVERRIDE_TYPE_INT: {
- LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
  } break;
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
- LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
+ LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
+ } break;
+ case LLAMA_KV_OVERRIDE_TYPE_STR: {
+ LLAMA_LOG_INFO("%s\n", ovrd->val_str);
  } break;
  default:
  // Shouldn't be possible to end up here, but just in case...
@@ -2899,7 +2934,7 @@ namespace GGUFMeta {
  static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
- target = ovrd->bool_value;
+ target = ovrd->val_bool;
  return true;
  }
  return false;
@@ -2909,7 +2944,7 @@ namespace GGUFMeta {
  static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
- target = ovrd->int_value;
+ target = ovrd->val_i64;
  return true;
  }
  return false;
@@ -2919,7 +2954,7 @@ namespace GGUFMeta {
  static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
- target = ovrd->float_value;
+ target = ovrd->val_f64;
  return true;
  }
  return false;
@@ -2928,12 +2963,11 @@ namespace GGUFMeta {
  template<typename OT>
  static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
- (void)target;
- (void)ovrd;
- if (!ovrd) { return false; }
- // Currently, we should never end up here so it would be a bug if we do.
- throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
- ovrd ? ovrd->key : "NULL"));
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
+ target = ovrd->val_str;
+ return true;
+ }
+ return false;
  }

  static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
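Editor's note: metadata overrides now cover strings (LLAMA_KV_OVERRIDE_TYPE_STR plus the renamed val_* union members). A hedged sketch of overriding a string key at load time; the field names follow this diff, while the fixed-size key/val_str buffers and null-key termination are assumptions about llama.h in this release:

```cpp
// Sketch: force the pre-tokenizer type via a string KV override.
#include <cstdio>
#include "llama.h"

llama_model * load_with_override(const char * path) {
    llama_model_kv_override overrides[2] = {};
    std::snprintf(overrides[0].key, sizeof(overrides[0].key), "%s", "tokenizer.ggml.pre");
    overrides[0].tag = LLAMA_KV_OVERRIDE_TYPE_STR;
    std::snprintf(overrides[0].val_str, sizeof(overrides[0].val_str), "%s", "llama3");
    overrides[1].key[0] = '\0'; // assumed: the list is terminated by an empty key

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = overrides;
    return llama_load_model_from_file(path, mparams);
}
```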
@@ -2966,6 +3000,7 @@ struct llama_model_loader {
  size_t n_bytes = 0;

  bool use_mmap = false;
+ bool check_tensors;

  llama_files files;
  llama_ftype ftype;
@@ -2980,9 +3015,13 @@ struct llama_model_loader {

  ggml_tensor * tensor;

- llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+ llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
  const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
  offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+ if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+ throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+ }
  }
  };
  std::vector<llama_tensor_weight> weights;
@@ -2995,7 +3034,7 @@ struct llama_model_loader {
  std::string arch_name;
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

- llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
+ llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
  int trace = 0;
  if (getenv("LLAMA_TRACE")) {
  trace = atoi(getenv("LLAMA_TRACE"));
@@ -3021,15 +3060,15 @@ struct llama_model_loader {
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));

+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ contexts.emplace_back(ctx);
+
  // Save tensors data offset of the main file.
  // For subsidiary files, `meta` tensor data offset must not be used,
  // so we build a unified tensors index for weights.
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- weights.emplace_back(0, cur->name, meta, cur);
+ weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
  }
- files.emplace_back(new llama_file(fname.c_str(), "rb"));
- contexts.emplace_back(ctx);
-
  uint16_t n_split = 0;
  get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

@@ -3063,12 +3102,13 @@ struct llama_model_loader {
  throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
  }

+ files.emplace_back(new llama_file(split_path, "rb"));
+ contexts.emplace_back(ctx);
+
  // Save tensors data offset info of the shard.
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+ weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
  }
- files.emplace_back(new llama_file(split_path, "rb"));
- contexts.emplace_back(ctx);

  gguf_free(ctx_gguf);
  }
@@ -3091,9 +3131,17 @@ struct llama_model_loader {

  fver = (enum llama_fver) gguf_get_version(meta);

+ std::set<std::string> tensor_names;
  for (auto & w : weights) {
  n_elements += ggml_nelements(w.tensor);
  n_bytes += ggml_nbytes(w.tensor);
+ // make sure there is no duplicated tensor names
+ const std::string name(w.tensor->name);
+ auto found = tensor_names.find(name);
+ if (found != tensor_names.end()) {
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
+ }
+ tensor_names.insert(name);
  }

  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -3199,6 +3247,7 @@ struct llama_model_loader {
  }

  this->use_mmap = use_mmap;
+ this->check_tensors = check_tensors;
  }

  ~llama_model_loader() {
@@ -3278,6 +3327,10 @@ struct llama_model_loader {
  return nullptr;
  }

+ const llama_tensor_weight * get_weight(int i) const {
+ return get_weight(get_tensor_name(i));
+ }
+
  const llama_tensor_weight & require_weight(const char * name) const {
  const llama_tensor_weight * weight = get_weight(name);
  if (!weight) {
@@ -3453,6 +3506,10 @@ struct llama_model_loader {
  file->seek(w.offs, SEEK_SET);
  file->read_raw(cur->data, ggml_nbytes(cur));
  }
+
+ if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+ }
  }

  size_t size_done = 0;
@@ -3469,6 +3526,8 @@ struct llama_model_loader {
  GGML_ASSERT(size_data != 0 && "call init_mappings() first");

  std::vector<no_init<uint8_t>> read_buf;
+ std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
+
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
  const auto * weight = get_weight(ggml_get_name(cur));
  if (weight == nullptr) {
@@ -3490,37 +3549,66 @@ struct llama_model_loader {
  if (bufs_mmap.count(weight->idx)) {
  buf_mmap = bufs_mmap.at(weight->idx);
  }
+ uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+
+ if (check_tensors) {
+ validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+ }));
+ }
+
  GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
  if (buf_mmap && cur->data == nullptr) {
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
+ ggml_backend_tensor_alloc(buf_mmap, cur, data);
  if (lmlocks) {
  const auto & lmlock = lmlocks->at(weight->idx);
- lmlock->grow_to(weight->offs + ggml_nbytes(cur));
+ lmlock->grow_to(weight->offs + n_size);
  }

  auto & mmap_used = mmaps_used[weight->idx];
  mmap_used.first = std::min(mmap_used.first, weight->offs);
  mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
  } else {
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
+ ggml_backend_tensor_set(cur, data, 0, n_size);
  }
  } else {
  GGML_ASSERT(weight->idx < files.size());
  const auto & file = files.at(weight->idx);
  if (ggml_backend_buffer_is_host(cur->buffer)) {
  file->seek(weight->offs, SEEK_SET);
- file->read_raw(cur->data, ggml_nbytes(cur));
+ file->read_raw(cur->data, n_size);
+ if (check_tensors) {
+ validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
+ }));
+ }
  } else {
- read_buf.resize(ggml_nbytes(cur));
+ read_buf.resize(n_size);
  file->seek(weight->offs, SEEK_SET);
- file->read_raw(read_buf.data(), ggml_nbytes(cur));
+ file->read_raw(read_buf.data(), n_size);
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+ if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+ }
  }
  }

  size_done += n_size;
  }

+ // check validation results
+ bool validation_failed = false;
+ for (auto & future : validation_result) {
+ auto result = future.get();
+ if (!result.second) {
+ LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
+ validation_failed = true;
+ }
+ }
+ if (validation_failed) {
+ throw std::runtime_error("found tensors with invalid data");
+ }
+
  // check if this is the last call and do final cleanup
  if (size_done >= size_data) {
  // unmap offloaded tensors and metadata
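Editor's note: with check_tensors enabled the loader validates tensor data off the critical path by pushing std::async jobs into a vector of futures and joining them after the copy loop. The same fan-out/join shape in isolation, as a self-contained sketch (the names here are illustrative, not llama.cpp APIs):

```cpp
// Self-contained sketch of the fan-out/join validation pattern used above.
#include <cmath>
#include <cstdio>
#include <future>
#include <utility>
#include <vector>

static bool validate_chunk(const std::vector<float> & chunk) {
    for (float x : chunk) {
        if (!std::isfinite(x)) return false; // reject NaN/Inf, similar in spirit to ggml_validate_row_data
    }
    return true;
}

int main() {
    std::vector<std::vector<float>> chunks = {{1.0f, 2.0f}, {3.0f, 4.0f}};
    std::vector<std::future<std::pair<size_t, bool>>> results;

    // fan out: one asynchronous validation job per chunk
    for (size_t i = 0; i < chunks.size(); ++i) {
        results.emplace_back(std::async(std::launch::async, [i, &chunks] {
            return std::make_pair(i, validate_chunk(chunks[i]));
        }));
    }

    // join: collect results after the main loop has finished its other work
    bool failed = false;
    for (auto & f : results) {
        auto r = f.get();
        if (!r.second) { std::fprintf(stderr, "chunk %zu invalid\n", r.first); failed = true; }
    }
    return failed ? 1 : 0;
}
```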
@@ -3770,7 +3858,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = e_model::MODEL_7B; break;
+ case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -3955,6 +4043,16 @@ static void llm_load_hparams(
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_PHI3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
@@ -4104,7 +4202,7 @@ static void llm_load_hparams(
  model.ftype = ml.ftype;

  if (hparams.f_max_alibi_bias > 0.0f) {
- hparams.need_kq_pos = true;
+ hparams.use_alibi = true;
  }

  hparams.rope_type = llama_rope_type(&model);
@@ -4127,11 +4225,13 @@ static void llm_load_vocab(

  // determine vocab type
  {
- std::string tokenizer_name;
+ std::string tokenizer_model;
+ std::string tokenizer_pre;

- ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);

- if (tokenizer_name == "no_vocab") {
+ if (tokenizer_model == "no_vocab") {
  vocab.type = LLAMA_VOCAB_TYPE_NONE;

  // default special tokens
@@ -4145,7 +4245,7 @@ static void llm_load_vocab(
  vocab.linefeed_id = -1;

  return;
- } else if (tokenizer_name == "llama") {
+ } else if (tokenizer_model == "llama") {
  vocab.type = LLAMA_VOCAB_TYPE_SPM;

  // default special tokens
@@ -4179,7 +4279,10 @@ static void llm_load_vocab(
  vocab.special_prefix_id = 67;
  vocab.special_suffix_id = 69;
  vocab.special_middle_id = 68;
- vocab.special_eot_id = 70;
+ // TODO: this is not EOT, it is "file separator" token, needs fix
+ // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+ //vocab.special_eot_id = 70;
+ vocab.special_eot_id = 107;
  }
  }

@@ -4187,9 +4290,27 @@ static void llm_load_vocab(
  if (add_space_prefix_keyidx != -1) {
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
  } // The default value of add_space_prefix is true.
- } else if (tokenizer_name == "gpt2") {
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
+ } else if (tokenizer_model == "bert") {
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;

+ // default special tokens
+ vocab.special_bos_id = -1;
+ vocab.special_eos_id = -1;
+ vocab.special_unk_id = 100;
+ vocab.special_sep_id = 102;
+ vocab.special_pad_id = 0;
+ vocab.special_cls_id = 101;
+ vocab.special_mask_id = 103;
+ vocab.add_space_prefix = false;
+ } else {
+ if (tokenizer_model == "gpt2") {
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
+ } else {
+ LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
+ LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
+ vocab.type = LLAMA_VOCAB_TYPE_SPM;
+ return;
+ }
  // read bpe merges and populate bpe ranks
  const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
  if (merges_keyidx == -1) {
@@ -4223,23 +4344,50 @@ static void llm_load_vocab(
  vocab.special_pad_id = -1;
  vocab.special_cls_id = -1;
  vocab.special_mask_id = -1;
- } else if (tokenizer_name == "bert") {
- vocab.type = LLAMA_VOCAB_TYPE_WPM;
+ }

- // default special tokens
- vocab.special_bos_id = -1;
- vocab.special_eos_id = -1;
- vocab.special_unk_id = 100;
- vocab.special_sep_id = 102;
- vocab.special_pad_id = 0;
- vocab.special_cls_id = 101;
- vocab.special_mask_id = 103;
- vocab.add_space_prefix = false;
+ // for now, only BPE models have pre-tokenizers
+ if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+ if (tokenizer_pre.empty()) {
+ LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+ LLAMA_LOG_WARN("%s: \n", __func__);
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+ LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
+ LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+ LLAMA_LOG_WARN("%s: \n", __func__);
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ } else if (
+ tokenizer_pre == "default") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ } else if (
+ tokenizer_pre == "llama3" ||
+ tokenizer_pre == "llama-v3" ||
+ tokenizer_pre == "llama-bpe") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+ } else if (
+ tokenizer_pre == "deepseek-llm") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+ } else if (
+ tokenizer_pre == "deepseek-coder") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+ } else if (
+ tokenizer_pre == "falcon") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+ } else if (
+ tokenizer_pre == "mpt") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
+ } else if (
+ tokenizer_pre == "starcoder") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+ } else if (
+ tokenizer_pre == "gpt-2") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+ } else {
+ throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ }
  } else {
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  }
  }
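Editor's note: BPE vocabularies are now expected to carry a tokenizer.ggml.pre metadata key naming the pre-tokenizer, and when it is missing the loader falls back to 'default' and prints the degradation warning above. A hedged sketch for inspecting that key with ggml's gguf API (function signatures assumed from the gguf API of this era):

```cpp
// Sketch: check whether a GGUF model declares a pre-tokenizer type.
#include <cstdio>
#include "ggml.h"

void print_pre_tokenizer(const char * fname) {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (!ctx) { std::fprintf(stderr, "failed to open %s\n", fname); return; }

    const int key_id = gguf_find_key(ctx, "tokenizer.ggml.pre");
    if (key_id >= 0) {
        std::printf("tokenizer.ggml.pre = %s\n", gguf_get_val_str(ctx, key_id));
    } else {
        std::printf("tokenizer.ggml.pre missing: the loader will warn and use 'default'\n");
    }
    gguf_free(ctx);
}
```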

@@ -4308,6 +4456,7 @@ static void llm_load_vocab(
  { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
  { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
  };
+
  for (const auto & it : special_token_types) {
  const std::string & key = kv(std::get<0>(it));
  int32_t & id = std::get<1>(it);
@@ -4322,7 +4471,6 @@ static void llm_load_vocab(
  } else {
  id = new_id;
  }
-
  }

  // Handle add_bos_token and add_eos_token
@@ -4336,6 +4484,28 @@ static void llm_load_vocab(
  vocab.special_add_eos = int(temp);
  }
  }
+
+ // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+ //
+ // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
+ // for now, we apply this workaround to find the EOT token based on its text
+ if (vocab.special_eot_id == -1) {
+ for (const auto & t : vocab.token_to_id) {
+ if (
+ // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
+ // need to fix convert script
+ //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
+ (t.first == "<|eot_id|>" ||
+ t.first == "<|im_end|>" ||
+ t.first == "<|end|>" ||
+ t.first == "<end_of_turn>"
+ )
+ ) {
+ vocab.special_eot_id = t.second;
+ break;
+ }
+ }
+ }
  }

  // build special tokens cache
@@ -4498,14 +4668,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

  // special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
- if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
+ if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
+ if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
+ if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
  }

  // Returns false if cancelled by progress_callback
@@ -5346,6 +5521,33 @@ static bool llm_load_tensors(
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  }
  } break;
+ case LLM_ARCH_PHI3:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context* ctx_layer = ctx_for_layer(i);
+ ggml_context* ctx_split = ctx_for_layer_split(i);
+
+ auto& layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+ }
+ } break;
  case LLM_ARCH_PLAMO:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5880,7 +6082,7 @@ static bool llm_load_tensors(
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
  static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
  try {
- llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
+ llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);

  model.hparams.vocab_only = params.vocab_only;

@@ -6009,37 +6211,47 @@ static struct ggml_tensor * llm_build_inp_embd(
  static void llm_build_kv_store(
  struct ggml_context * ctx,
  const llama_hparams & hparams,
+ const llama_cparams & cparams,
  const llama_kv_cache & kv,
  struct ggml_cgraph * graph,
  struct ggml_tensor * k_cur,
  struct ggml_tensor * v_cur,
- int64_t n_ctx,
  int32_t n_tokens,
  int32_t kv_head,
  const llm_build_cb & cb,
  int64_t il) {
+ const int64_t n_ctx = cparams.n_ctx;
+
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();

  GGML_ASSERT(kv.size == n_ctx);

- // compute the transposed [n_tokens, n_embd] V matrix
- assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
- cb(v_cur_t, "v_cur_t", il);
-
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
  (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
  cb(k_cache_view, "k_cache_view", il);

- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
- ( n_ctx)*ggml_element_size(kv.v_l[il]),
- (kv_head)*ggml_element_size(kv.v_l[il]));
+ // note: storing RoPE-ed version of K in the KV cache
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
+
+ struct ggml_tensor * v_cache_view = nullptr;
+
+ if (cparams.flash_attn) {
+ v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
+ (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
+ } else {
+ // note: the V cache is transposed when not using flash attention
+ v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
+ ( n_ctx)*ggml_element_size(kv.v_l[il]),
+ (kv_head)*ggml_element_size(kv.v_l[il]));
+
+ v_cur = ggml_transpose(ctx, v_cur);
+ }
  cb(v_cache_view, "v_cache_view", il);

- // important: storing RoPE-ed version of K in the KV cache!
- ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
- ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
  }

  static struct ggml_tensor * llm_build_norm(
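Editor's note: with flash attention the V cache is stored row-major per token (like K) instead of transposed, which is what the new v_trans flag and the ggml_view_1d/ggml_view_2d branch above encode. A small standalone sketch of the two addressing schemes (pure index arithmetic, not ggml code):

```cpp
// Sketch: element offset of value v[d] for the token at cache slot `pos`,
// in the two layouts handled above (n_ctx = cache size, n_embd = n_embd_v_gqa).
#include <cstddef>

// flash-attention layout: one contiguous row of n_embd values per cache slot
inline size_t offs_flat(size_t pos, size_t d, size_t n_embd) {
    return pos * n_embd + d;
}

// classic layout: V is transposed, so dimension d occupies a row of length n_ctx
inline size_t offs_trans(size_t pos, size_t d, size_t n_ctx) {
    return d * n_ctx + pos;
}
```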
@@ -6259,11 +6471,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
  return moe_out;
  }

- // if max_alibi_bias > 0 then apply ALiBi
  static struct ggml_tensor * llm_build_kqv(
  struct ggml_context * ctx,
  const llama_model & model,
  const llama_hparams & hparams,
+ const llama_cparams & cparams,
  const llama_kv_cache & kv,
  struct ggml_cgraph * graph,
  struct ggml_tensor * wo,
@@ -6271,12 +6483,12 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * q_cur,
  struct ggml_tensor * kq_mask,
  struct ggml_tensor * kq_pos,
- int64_t n_ctx,
  int32_t n_tokens,
  int32_t n_kv,
  float kq_scale,
  const llm_build_cb & cb,
  int il) {
+ const int64_t n_ctx = cparams.n_ctx;
  const int64_t n_head = hparams.n_head;
  const int64_t n_head_kv = hparams.n_head_kv;
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -6294,71 +6506,99 @@ static struct ggml_tensor * llm_build_kqv(
  0);
  cb(k, "k", il);

- struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
- cb(kq, "kq", il);
+ struct ggml_tensor * cur;

- if (model.arch == LLM_ARCH_PHI2) {
- // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
- // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
- ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
- }
+ if (cparams.flash_attn) {
+ GGML_UNUSED(model);
+ GGML_UNUSED(n_ctx);

- if (model.arch == LLM_ARCH_GROK) {
- // need to do the following:
- // multiply by attn_output_multiplyer of 0.08838834764831845
- // and then :
- // kq = 30 * tanh(kq / 30)
- // before the softmax below
+ // note: if this assert triggers, then some check has failed earlier
+ // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
+ GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");

- //try from phi2
- //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+ // split cached v into n_head heads (not transposed)
+ struct ggml_tensor * v =
+ ggml_view_3d(ctx, kv.v_l[il],
+ n_embd_head_v, n_kv, n_head_kv,
+ ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+ 0);
+ cb(v, "v", il);

- kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
- kq = ggml_scale(ctx, kq, 30);
- }
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+ ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
+ }
+
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
+ } else {
+ struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+ cb(kq, "kq", il);
+
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+ // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
+ // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+ }
+
+ if (model.arch == LLM_ARCH_GROK) {
+ // need to do the following:
+ // multiply by attn_output_multiplyer of 0.08838834764831845
+ // and then :
+ // kq = 30 * tanh(kq / 30)
+ // before the softmax below
+
+ //try from phi2
+ //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
+ kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
+ kq = ggml_scale(ctx, kq, 30);
+ }

  #if defined(GGML_USE_KOMPUTE)
  #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
- if (hparams.f_max_alibi_bias > 0.0f) {
- kq = ggml_scale(ctx, kq, kq_scale);
- cb(kq, "kq_scaled", il);
+ if (hparams.use_alibi) {
+ kq = ggml_scale(ctx, kq, kq_scale);
+ cb(kq, "kq_scaled", il);

- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
- cb(kq, "kq_scaled_alibi", il);
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
+ cb(kq, "kq_scaled_alibi", il);

- kq = ggml_add(ctx, kq, kq_mask);
- cb(kq, "kq_masked", il);
+ kq = ggml_add(ctx, kq, kq_mask);
+ cb(kq, "kq_masked", il);

- kq = ggml_soft_max(ctx, kq);
- cb(kq, "kq_soft_max", il);
- } else
+ kq = ggml_soft_max(ctx, kq);
+ cb(kq, "kq_soft_max", il);
+ } else
  #endif
- {
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
- }
+ {
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);
+ }

- GGML_ASSERT(kv.size == n_ctx);
+ GGML_ASSERT(kv.size == n_ctx);

- // split cached v into n_head heads
- struct ggml_tensor * v =
- ggml_view_3d(ctx, kv.v_l[il],
- n_kv, n_embd_head_v, n_head_kv,
- ggml_element_size(kv.v_l[il])*n_ctx,
- ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
- 0);
- cb(v, "v", il);
+ // split cached v into n_head heads
+ struct ggml_tensor * v =
+ ggml_view_3d(ctx, kv.v_l[il],
+ n_kv, n_embd_head_v, n_head_kv,
+ ggml_element_size(kv.v_l[il])*n_ctx,
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
+ 0);
+ cb(v, "v", il);

- struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
- cb(kqv, "kqv", il);
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
+ cb(kqv, "kqv", il);

- struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
- cb(kqv_merged, "kqv_merged", il);
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);

- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
- cb(cur, "kqv_merged_cont", il);
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
+ }

  ggml_build_forward_expand(graph, cur);

@@ -6378,6 +6618,7 @@ static struct ggml_tensor * llm_build_kv(
  struct ggml_context * ctx,
  const llama_model & model,
  const llama_hparams & hparams,
+ const llama_cparams & cparams,
  const llama_kv_cache & kv,
  struct ggml_cgraph * graph,
  struct ggml_tensor * wo,
@@ -6387,7 +6628,6 @@ static struct ggml_tensor * llm_build_kv(
  struct ggml_tensor * q_cur,
  struct ggml_tensor * kq_mask,
  struct ggml_tensor * kq_pos,
- int64_t n_ctx,
  int32_t n_tokens,
  int32_t kv_head,
  int32_t n_kv,
@@ -6401,12 +6641,12 @@ static struct ggml_tensor * llm_build_kv(
  ggml_build_forward_expand(graph, k_cur);
  ggml_build_forward_expand(graph, v_cur);

- llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
+ llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);

  struct ggml_tensor * cur;

- cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
- q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
+ cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
+ q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
  cb(cur, "kqv_out", il);

  return cur;
@@ -6448,6 +6688,8 @@ struct llm_build_context {
  const int32_t kv_head; // index of where we store new KV data in the cache
  const int32_t n_orig_ctx;

+ const bool flash_attn;
+
  const enum llama_pooling_type pooling_type;
  const enum llama_rope_type rope_type;

@@ -6494,6 +6736,7 @@ struct llm_build_context {
  n_outputs (worst_case ? n_tokens : lctx.n_outputs),
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
  n_orig_ctx (cparams.n_yarn_orig_ctx),
+ flash_attn (cparams.flash_attn),
  pooling_type (cparams.pooling_type),
  rope_type (hparams.rope_type),
  cb (cb),
@@ -6608,15 +6851,31 @@ struct llm_build_context {
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));

- ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
- nm, n_embd_v_gqa,
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
- ggml_row_size(kv_self.v_l[il]->type, i));
+ ggml_tensor * view_v_src;
+ ggml_tensor * view_v_dst;

- ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
- nm, n_embd_v_gqa,
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
- ggml_row_size(kv_self.v_l[il]->type, id));
+ if (flash_attn) {
+ // NOTE: the V cache is not transposed when using flash attention
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+ n_embd_v_gqa, nm,
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+ n_embd_v_gqa, nm,
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+ } else {
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+ nm, n_embd_v_gqa,
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+ ggml_row_size(kv_self.v_l[il]->type, i));
+
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+ nm, n_embd_v_gqa,
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+ ggml_row_size(kv_self.v_l[il]->type, id));
+ }

  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
@@ -6646,20 +6905,26 @@ struct llm_build_context {

  struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
  if (causal) {
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
  } else {
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
  }
  cb(lctx.inp_KQ_mask, "KQ_mask", -1);
  ggml_set_input(lctx.inp_KQ_mask);
- return lctx.inp_KQ_mask;
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
  }

- struct ggml_tensor * build_inp_KQ_pos() {
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
+ struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
+ if (causal) {
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
+ } else {
+ // TODO: this will be needed for ALiBi-based BERT models
+ // https://github.com/ggerganov/llama.cpp/pull/6826
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
+ }
  cb(lctx.inp_KQ_pos, "KQ_pos", -1);
  ggml_set_input(lctx.inp_KQ_pos);
- return lctx.inp_KQ_pos;
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
  }

  struct ggml_tensor * build_inp_mean() {
@@ -6765,9 +7030,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -6905,9 +7170,9 @@ struct llm_build_context {
  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7012,9 +7277,9 @@ struct llm_build_context {
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7132,9 +7397,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7257,9 +7522,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  }

  if (il == n_layer - 1) {
@@ -7409,9 +7674,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
- model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7521,9 +7786,9 @@ struct llm_build_context {

  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7725,9 +7990,9 @@ struct llm_build_context {
  );
  cb(Vcur, "Vcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7821,9 +8086,9 @@ struct llm_build_context {
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  cb(Qcur, "Qcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8114,9 +8379,9 @@ struct llm_build_context {

  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8245,14 +8510,15 @@ struct llm_build_context {
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
- model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  } else {
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
  }

@@ -8394,9 +8660,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8512,9 +8778,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8625,9 +8891,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8739,9 +9005,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8894,9 +9160,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  }

  if (il == n_layer - 1) {
@@ -8938,12 +9204,140 @@ struct llm_build_context {

  cur = ggml_add(ctx0, cur, model.output_b);
  cb(cur, "result_output", -1);
+ ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
+
+ struct ggml_cgraph * build_phi3() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ auto residual = inpL;
+
+ // self-attention
+ {
+ struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
9235
+ model.layers[il].attn_norm,
9236
+ NULL,
9237
+ LLM_NORM_RMS, cb, il);
9238
+ cb(attn_norm_output, "attn_norm", il);
9239
+
9240
+ struct ggml_tensor * Qcur = nullptr;
9241
+ struct ggml_tensor * Kcur = nullptr;
9242
+ struct ggml_tensor * Vcur = nullptr;
9243
+
9244
+ if (model.layers[il].wqkv) {
9245
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
9246
+ cb(cur, "wqkv", il);
9247
+
9248
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
9249
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
9250
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
9251
+ }
9252
+ else {
9253
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
9254
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
9255
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
9256
+ }
9257
+
9258
+ cb(Qcur, "Qcur", il);
9259
+ cb(Kcur, "Kcur", il);
9260
+ cb(Vcur, "Vcur", il);
9261
+
9262
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9263
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9264
+
9265
+ Qcur = ggml_rope_custom(
9266
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9267
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9268
+ );
9269
+ cb(Qcur, "Qcur", il);
9270
+
9271
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
9272
+ cb(Qcur, "Qcur", il);
9273
+
9274
+ Kcur = ggml_rope_custom(
9275
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9276
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9277
+ );
9278
+ cb(Kcur, "Kcur", il);
9279
+
9280
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9281
+ model.layers[il].wo, model.layers[il].bo,
9282
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9283
+ }
9284
+
9285
+ if (il == n_layer - 1) {
9286
+ // skip computing output for unused tokens
9287
+ struct ggml_tensor* inp_out_ids = build_inp_out_ids();
9288
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9289
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
9290
+ }
9291
+
9292
+ cur = ggml_add(ctx0, cur, residual);
9293
+ residual = cur;
9294
+
9295
+ cur = llm_build_norm(ctx0, cur, hparams,
9296
+ model.layers[il].ffn_norm, NULL,
9297
+ LLM_NORM_RMS, cb, il);
9298
+ cb(cur, "ffn_norm", il);
9299
+
9300
+ // FF
9301
+ // special-case: the up and gate tensors are merged into a single tensor
9302
+ // TODO: support this in llm_build_ffn
9303
+ {
9304
+ struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
9305
+ cb(up, "ffn_up", il);
9306
+
9307
+ auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
9308
+ auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
9309
+
9310
+ y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
9311
+ cb(y, "ffn_gate", il);
9312
+
9313
+ auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
9314
+ cb(down, "ffn_down", il);
9315
+
9316
+ cur = down;
9317
+ cb(cur, "ffn_out", il);
9318
+ }
9319
+
9320
+ cur = ggml_add(ctx0, residual, cur);
9321
+ cb(cur, "l_out", il);
9322
+
9323
+ inpL = cur;
9324
+ }
9325
+
9326
+ cur = llm_build_norm(ctx0, inpL, hparams,
9327
+ model.output_norm,
9328
+ NULL,
9329
+ LLM_NORM_RMS, cb, -1);
9330
+ cb(cur, "result_norm", -1);
9331
+
9332
+ cur = ggml_mul_mat(ctx0, model.output, cur);
9333
+ cb(cur, "result_output", -1);
8941
9334
 
8942
9335
  ggml_build_forward_expand(gf, cur);
8943
9336
 
8944
9337
  return gf;
8945
9338
  }
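For clarity, the FFN special case in build_phi3 above computes down(silu(gate) * up) with the gate and up projections stored side by side in one ffn_up tensor: the zero-offset view is the gate half and the half-row-offset view is the up half. A per-element sketch of that math, for illustration only (hypothetical helper name; assumes expf from <cmath>, which this file already relies on):

    // illustrative only - not part of the diff; mirrors the y = y * silu(g) step above
    static inline float phi3_swiglu_sketch(float gate, float up) {
        const float silu_gate = gate / (1.0f + expf(-gate)); // SiLU(gate)
        return silu_gate * up;                               // this product is then projected by ffn_down
    }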
8946
9339
 
9340
+
8947
9341
  struct ggml_cgraph * build_plamo() {
8948
9342
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);
8949
9343
 
@@ -8996,9 +9390,9 @@ struct llm_build_context {
8996
9390
  ext_factor, attn_factor, beta_fast, beta_slow);
8997
9391
  cb(Kcur, "Kcur", il);
8998
9392
 
8999
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9393
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9000
9394
  model.layers[il].wo, NULL,
9001
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9395
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9002
9396
  }
9003
9397
  struct ggml_tensor * sa_out = cur;
9004
9398
 
@@ -9099,9 +9493,9 @@ struct llm_build_context {
9099
9493
 
9100
9494
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9101
9495
 
9102
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9496
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9103
9497
  model.layers[il].wo, model.layers[il].bo,
9104
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9498
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9105
9499
  }
9106
9500
 
9107
9501
  if (il == n_layer - 1) {
@@ -9206,9 +9600,9 @@ struct llm_build_context {
9206
9600
  );
9207
9601
  cb(Kcur, "Kcur", il);
9208
9602
 
9209
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9603
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9210
9604
  model.layers[il].wo, model.layers[il].bo,
9211
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9605
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9212
9606
  }
9213
9607
 
9214
9608
  if (il == n_layer - 1) {
@@ -9322,9 +9716,9 @@ struct llm_build_context {
9322
9716
  );
9323
9717
  cb(Kcur, "Kcur", il);
9324
9718
 
9325
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9719
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9326
9720
  model.layers[il].wo, NULL,
9327
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9721
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9328
9722
  }
9329
9723
 
9330
9724
  if (il == n_layer - 1) {
@@ -9439,9 +9833,9 @@ struct llm_build_context {
9439
9833
  );
9440
9834
  cb(Kcur, "Kcur", il);
9441
9835
 
9442
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9836
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9443
9837
  model.layers[il].wo, model.layers[il].bo,
9444
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9838
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9445
9839
  }
9446
9840
 
9447
9841
  if (il == n_layer - 1) {
@@ -9569,9 +9963,9 @@ struct llm_build_context {
9569
9963
  );
9570
9964
  cb(Kcur, "Kcur", il);
9571
9965
 
9572
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9966
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9573
9967
  model.layers[il].wo, model.layers[il].bo,
9574
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9968
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9575
9969
  }
9576
9970
 
9577
9971
  if (il == n_layer - 1) {
@@ -9690,9 +10084,9 @@ struct llm_build_context {
9690
10084
  ext_factor, attn_factor, beta_fast, beta_slow);
9691
10085
  cb(Kcur, "Kcur", il);
9692
10086
 
9693
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10087
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9694
10088
  model.layers[il].wo, NULL,
9695
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10089
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9696
10090
  }
9697
10091
 
9698
10092
  if (il == n_layer - 1) {
@@ -9809,9 +10203,9 @@ struct llm_build_context {
9809
10203
  );
9810
10204
  cb(Kcur, "Kcur", il);
9811
10205
 
9812
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10206
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9813
10207
  model.layers[il].wo, model.layers[il].bo,
9814
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10208
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9815
10209
  }
9816
10210
 
9817
10211
  if (il == n_layer - 1) {
@@ -10099,9 +10493,9 @@ struct llm_build_context {
10099
10493
  );
10100
10494
  cb(Kcur, "Kcur", il);
10101
10495
 
10102
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10496
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10103
10497
  model.layers[il].wo, model.layers[il].bo,
10104
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10498
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10105
10499
  }
10106
10500
 
10107
10501
  if (il == n_layer - 1) {
@@ -10230,9 +10624,9 @@ struct llm_build_context {
10230
10624
  );
10231
10625
  cb(Kcur, "Kcur", il);
10232
10626
 
10233
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10627
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10234
10628
  model.layers[il].wo, nullptr,
10235
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10629
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10236
10630
  }
10237
10631
 
10238
10632
  if (il == n_layer - 1) {
@@ -10445,6 +10839,10 @@ static struct ggml_cgraph * llama_build_graph(
10445
10839
  {
10446
10840
  result = llm.build_phi2();
10447
10841
  } break;
10842
+ case LLM_ARCH_PHI3:
10843
+ {
10844
+ result = llm.build_phi3();
10845
+ } break;
10448
10846
  case LLM_ARCH_PLAMO:
10449
10847
  {
10450
10848
  result = llm.build_plamo();
@@ -10655,7 +11053,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
10655
11053
  }
10656
11054
  }
10657
11055
 
10658
- if (hparams.need_kq_pos) {
11056
+ // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
11057
+ // this allows processing multiple sequences in parallel with ALiBi-based models
11058
+ if (hparams.use_alibi) {
10659
11059
  const int64_t n_kv = kv_self.n;
10660
11060
 
10661
11061
  GGML_ASSERT(lctx.inp_KQ_pos);
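The reason KQ_pos is still needed here: ALiBi replaces rotary embeddings with a per-head linear bias on the attention scores, and that bias depends on the positions of the KV cells being attended. A conceptual sketch of the bias (not the ggml kernel; the helper name is hypothetical):

    // score(query at position p_q, key at position p_k) receives an additive bias of:
    static inline float alibi_bias_sketch(float m_h /* head-specific slope */, int p_q, int p_k) {
        return -m_h * float(p_q - p_k); // grows more negative with distance, down-weighting far-away keys
    }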
@@ -11037,7 +11437,7 @@ static int llama_decode_internal(
11037
11437
  // a heuristic, to avoid attending the full cache if it is not yet utilized
11038
11438
  // after enough generations, the benefit from this heuristic disappears
11039
11439
  // if we start defragmenting the cache, the benefit from this will be more important
11040
- kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
11440
+ kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
11041
11441
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
11042
11442
  }
11043
11443
  }
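The padding granularity of kv_self.n changes from 32 to 256 here, matching the larger n_ctx padding introduced later in this change. A worked example with hypothetical numbers (GGML_PAD rounds up to a multiple of its second argument):

    // llama_kv_cache_cell_max(kv_self) == 1000:
    //   GGML_PAD(1000, 256) == 1024, so kv_self.n = min(kv_self.size, max(256, 1024)) == 1024
    // llama_kv_cache_cell_max(kv_self) == 3 (nearly empty cache):
    //   kv_self.n = min(kv_self.size, max(256, 256)) == 256, instead of the previous minimum of 32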
@@ -11205,6 +11605,10 @@ static int llama_decode_internal(
11205
11605
  }
11206
11606
  }
11207
11607
 
11608
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
11609
+ // overlap with device computation.
11610
+ ggml_backend_sched_reset(lctx.sched);
11611
+
11208
11612
  return 0;
11209
11613
  }
11210
11614
 
@@ -11230,7 +11634,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
11230
11634
  // each move requires 6*n_layer tensors (see build_defrag)
11231
11635
  // - source view, destination view, copy operation
11232
11636
  // - x2 for keys and values
11233
- const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
11637
+ //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
11638
+ // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
11639
+ const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
11234
11640
 
11235
11641
  // determine which KV cells to move where
11236
11642
  //
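The lowered cap leaves headroom for the roughly 2*n_layer additional nodes the defrag graph needs besides the move tensors, so the total node count stays within LLAMA_MAX_NODES. A quick check with hypothetical sizes (assuming LLAMA_MAX_NODES is 8192):

    // n_layer = 80:
    //   old: max_moves = 8192/480         = 17 -> 17*480 = 8160 move nodes, only 32 nodes of slack left
    //   new: max_moves = (8192 - 160)/480 = 16 -> 16*480 = 7680 move nodes + ~160 extra nodes still fit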
@@ -11554,7 +11960,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
11554
11960
  }
11555
11961
  case LLAMA_VOCAB_TYPE_BPE: {
11556
11962
  GGML_ASSERT(false);
11557
- return unicode_utf8_to_byte(token_data.text);
11963
+ return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
11558
11964
  }
11559
11965
  case LLAMA_VOCAB_TYPE_WPM: {
11560
11966
  GGML_ASSERT(false);
@@ -11776,7 +12182,79 @@ struct llm_tokenizer_bpe {
11776
12182
 
11777
12183
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
11778
12184
  int final_prev_index = -1;
11779
- auto word_collection = bpe_gpt2_preprocess(text);
12185
+
12186
+ std::vector<std::string> word_collection;
12187
+ switch (vocab.type) {
12188
+ case LLAMA_VOCAB_TYPE_BPE:
12189
+ switch (vocab.type_pre) {
12190
+ case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12191
+ word_collection = unicode_regex_split(text, {
12192
+ // original regex from tokenizer.json
12193
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12194
+
12195
+ // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
12196
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12197
+ });
12198
+ break;
12199
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12200
+ word_collection = unicode_regex_split(text, {
12201
+ "[\r\n]",
12202
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
12203
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
12204
+ "\\s+$",
12205
+ "[一-龥ࠀ-一가-퟿]+",
12206
+ "\\p{N}+",
12207
+ });
12208
+ break;
12209
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
12210
+ word_collection = unicode_regex_split(text, {
12211
+ "[\r\n]",
12212
+ "\\s?\\p{L}+",
12213
+ "\\s?\\p{P}+",
12214
+ "[一-龥ࠀ-一가-퟿]+",
12215
+ "\\p{N}+",
12216
+ });
12217
+ break;
12218
+ case LLAMA_VOCAB_PRE_TYPE_FALCON:
12219
+ word_collection = unicode_regex_split(text, {
12220
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12221
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12222
+ "\\p{N}+",
12223
+ "[0-9][0-9][0-9]",
12224
+ });
12225
+ break;
12226
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
12227
+ // TODO: MPT pre-tokenization regexes are unknown
12228
+ // the following are close, but not exact. run the following:
12229
+ // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
12230
+ GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
12231
+ word_collection = unicode_regex_split(text, {
12232
+ "\\s?\\p{L}+",
12233
+ "\\s?\\p{P}+",
12234
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12235
+ });
12236
+ break;
12237
+ case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12238
+ case LLAMA_VOCAB_PRE_TYPE_GPT2:
12239
+ word_collection = unicode_regex_split(text, {
12240
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12241
+ });
12242
+ break;
12243
+ default:
12244
+ // default regex for BPE tokenization pre-processing
12245
+ word_collection = unicode_regex_split(text, {
12246
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12247
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12248
+ "\\p{N}+",
12249
+ "[0-9][0-9][0-9]",
12250
+ });
12251
+ break;
12252
+ }
12253
+ break;
12254
+ default:
12255
+ GGML_ASSERT(false);
12256
+ break;
12257
+ }
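As a rough illustration of what this dispatch feeds into BPE, the GPT-2 style branch splits text into word-, number- and punctuation-shaped pieces. A hypothetical call (output shown approximately; the exact pieces depend on the regex support behind unicode_regex_split):

    // illustration only - not part of the change above
    const std::vector<std::string> pieces = unicode_regex_split(
        "Hello world's  code123!",
        { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)" });
    // roughly: "Hello", " world", "'s", " ", " code", "123", "!"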
11780
12258
 
11781
12259
  symbols_final.clear();
11782
12260
 
@@ -11903,145 +12381,6 @@ private:
11903
12381
  work_queue.push(bigram);
11904
12382
  }
11905
12383
 
11906
- std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
11907
- std::vector<std::string> bpe_words;
11908
- std::vector<std::string> bpe_encoded_words;
11909
-
11910
- std::string token = "";
11911
- // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
11912
- bool collecting_numeric = false;
11913
- bool collecting_letter = false;
11914
- bool collecting_special = false;
11915
- bool collecting_whitespace_lookahead = false;
11916
- bool collecting = false;
11917
-
11918
- std::vector<std::string> text_utf;
11919
- text_utf.reserve(text.size());
11920
- bpe_words.reserve(text.size());
11921
- bpe_encoded_words.reserve(text.size());
11922
-
11923
- const auto cpts = unicode_cpts_from_utf8(text);
11924
- for (size_t i = 0; i < cpts.size(); ++i)
11925
- text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
11926
-
11927
- for (int i = 0; i < (int)text_utf.size(); i++) {
11928
- const std::string & utf_char = text_utf[i];
11929
- bool split_condition = false;
11930
- int bytes_remain = text_utf.size() - i;
11931
- // forward backward lookups
11932
- const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
11933
- const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
11934
-
11935
- // handling contractions
11936
- if (!split_condition && bytes_remain >= 2) {
11937
- // 's|'t|'m|'d
11938
- if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
11939
- split_condition = true;
11940
- }
11941
- if (split_condition) {
11942
- if (token.size()) {
11943
- bpe_words.emplace_back(token); // push previous content as token
11944
- }
11945
- token = utf_char + utf_char_next;
11946
- bpe_words.emplace_back(token);
11947
- token = "";
11948
- i++;
11949
- continue;
11950
- }
11951
- }
11952
- if (!split_condition && bytes_remain >= 3) {
11953
- // 're|'ve|'ll
11954
- if (utf_char == "\'" && (
11955
- (utf_char_next == "r" && utf_char_next_next == "e") ||
11956
- (utf_char_next == "v" && utf_char_next_next == "e") ||
11957
- (utf_char_next == "l" && utf_char_next_next == "l"))
11958
- ) {
11959
- split_condition = true;
11960
- }
11961
- if (split_condition) {
11962
- // current token + next token can be defined
11963
- if (token.size()) {
11964
- bpe_words.emplace_back(token); // push previous content as token
11965
- }
11966
- token = utf_char + utf_char_next + utf_char_next_next;
11967
- bpe_words.emplace_back(token); // the contraction
11968
- token = "";
11969
- i += 2;
11970
- continue;
11971
- }
11972
- }
11973
-
11974
- if (!split_condition && !collecting) {
11975
- if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
11976
- collecting_letter = true;
11977
- collecting = true;
11978
- }
11979
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
11980
- collecting_numeric = true;
11981
- collecting = true;
11982
- }
11983
- else if (
11984
- ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
11985
- (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
11986
- ) {
11987
- collecting_special = true;
11988
- collecting = true;
11989
- }
11990
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
11991
- collecting_whitespace_lookahead = true;
11992
- collecting = true;
11993
- }
11994
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
11995
- split_condition = true;
11996
- }
11997
- }
11998
- else if (!split_condition && collecting) {
11999
- if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
12000
- split_condition = true;
12001
- }
12002
- else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
12003
- split_condition = true;
12004
- }
12005
- else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
12006
- split_condition = true;
12007
- }
12008
- else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
12009
- split_condition = true;
12010
- }
12011
- }
12012
-
12013
- if (utf_char_next == "") {
12014
- split_condition = true; // final
12015
- token += utf_char;
12016
- }
12017
-
12018
- if (split_condition) {
12019
- if (token.size()) {
12020
- bpe_words.emplace_back(token);
12021
- }
12022
- token = utf_char;
12023
- collecting = false;
12024
- collecting_letter = false;
12025
- collecting_numeric = false;
12026
- collecting_special = false;
12027
- collecting_whitespace_lookahead = false;
12028
- }
12029
- else {
12030
- token += utf_char;
12031
- }
12032
- }
12033
-
12034
- for (std::string & word : bpe_words) {
12035
- std::string encoded_token = "";
12036
- for (char & c : word) {
12037
- encoded_token += unicode_byte_to_utf8(c);
12038
- }
12039
- bpe_encoded_words.emplace_back(encoded_token);
12040
- }
12041
-
12042
- return bpe_encoded_words;
12043
- }
12044
-
12045
12384
  const llama_vocab & vocab;
12046
12385
 
12047
12386
  std::vector<llm_symbol> symbols;
@@ -12361,7 +12700,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12361
12700
  } break;
12362
12701
  case LLAMA_VOCAB_TYPE_BPE:
12363
12702
  {
12364
- if (add_special && vocab.special_add_bos == 1) {
12703
+ if (add_special && vocab.special_add_bos != 0) {
12365
12704
  GGML_ASSERT(vocab.special_bos_id != -1);
12366
12705
  output.push_back(vocab.special_bos_id);
12367
12706
  }
@@ -13268,16 +13607,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
13268
13607
  GGML_ASSERT(ctx);
13269
13608
  const int64_t t_start_sample_us = ggml_time_us();
13270
13609
 
13271
- bool allow_eos = false;
13610
+ bool allow_eog = false;
13272
13611
  for (const auto & stack : grammar->stacks) {
13273
13612
  if (stack.empty()) {
13274
- allow_eos = true;
13613
+ allow_eog = true;
13275
13614
  break;
13276
13615
  }
13277
13616
  }
13278
13617
 
13279
- const llama_token eos = llama_token_eos(&ctx->model);
13280
-
13281
13618
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
13282
13619
  candidates_decoded.reserve(candidates->size);
13283
13620
  std::vector<llama_grammar_candidate> candidates_grammar;
@@ -13285,9 +13622,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
13285
13622
 
13286
13623
  for (size_t i = 0; i < candidates->size; ++i) {
13287
13624
  const llama_token id = candidates->data[i].id;
13288
- const std::string piece = llama_token_to_piece(ctx, id);
13289
- if (id == eos) {
13290
- if (!allow_eos) {
13625
+ const std::string piece = llama_token_to_piece(ctx, id, false);
13626
+
13627
+ if (llama_token_is_eog(&ctx->model, id)) {
13628
+ if (!allow_eog) {
13291
13629
  candidates->data[i].logit = -INFINITY;
13292
13630
  }
13293
13631
  } else if (piece.empty() || piece[0] == 0) {
@@ -13450,7 +13788,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
13450
13788
  return result;
13451
13789
  }
13452
13790
 
13453
- llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
13791
+ llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
13454
13792
  GGML_ASSERT(ctx);
13455
13793
 
13456
13794
  const int64_t t_start_sample_us = ggml_time_us();
@@ -13463,7 +13801,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
13463
13801
  }
13464
13802
 
13465
13803
  std::discrete_distribution<> dist(probs.begin(), probs.end());
13466
- auto & rng = ctx->rng;
13467
13804
  int idx = dist(rng);
13468
13805
 
13469
13806
  llama_token result = candidates->data[idx].id;
@@ -13473,10 +13810,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
13473
13810
  return result;
13474
13811
  }
13475
13812
 
13813
+ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
13814
+ return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
13815
+ }
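The rng is now a parameter, so callers can supply their own generator for reproducible sampling, while llama_sample_token keeps the old behavior by forwarding ctx->rng. A minimal usage sketch (assumes candidates has already been filled from the logits and filtered):

    std::mt19937 rng(42); // caller-owned generator, seeded for reproducibility
    const llama_token id = llama_sample_token_with_rng(ctx, &candidates, rng);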
13816
+
13476
13817
  void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
13477
13818
  const int64_t t_start_sample_us = ggml_time_us();
13478
13819
 
13479
- if (token == llama_token_eos(&ctx->model)) {
13820
+ if (llama_token_is_eog(&ctx->model, token)) {
13480
13821
  for (const auto & stack : grammar->stacks) {
13481
13822
  if (stack.empty()) {
13482
13823
  return;
@@ -13485,7 +13826,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
13485
13826
  GGML_ASSERT(false);
13486
13827
  }
13487
13828
 
13488
- const std::string piece = llama_token_to_piece(ctx, token);
13829
+ const std::string piece = llama_token_to_piece(ctx, token, false);
13489
13830
 
13490
13831
  // Note terminating 0 in decoded string
13491
13832
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -14131,14 +14472,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
14131
14472
  }
14132
14473
 
14133
14474
  static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
14134
- std::mutex mutex;
14135
- int64_t counter = 0;
14136
- size_t new_size = 0;
14137
14475
  if (nthread < 2) {
14138
14476
  // single-thread
14139
- return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14477
+ size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14478
+ if (!ggml_validate_row_data(new_type, new_data, new_size)) {
14479
+ throw std::runtime_error("quantized data validation failed");
14480
+ }
14481
+ return new_size;
14140
14482
  }
14141
- auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
14483
+
14484
+ std::mutex mutex;
14485
+ int64_t counter = 0;
14486
+ size_t new_size = 0;
14487
+ bool valid = true;
14488
+ auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
14142
14489
  nrows, n_per_row, imatrix]() {
14143
14490
  const int64_t nrows_per_chunk = chunk_size / n_per_row;
14144
14491
  size_t local_size = 0;
@@ -14153,7 +14500,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14153
14500
  }
14154
14501
  lock.unlock();
14155
14502
  const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
14156
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14503
+ size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14504
+ local_size += this_size;
14505
+
14506
+ // validate the quantized data
14507
+ const size_t row_size = ggml_row_size(new_type, n_per_row);
14508
+ void * this_data = (char *) new_data + first_row * row_size;
14509
+ if (!ggml_validate_row_data(new_type, this_data, this_size)) {
14510
+ std::unique_lock<std::mutex> lock(mutex);
14511
+ valid = false;
14512
+ break;
14513
+ }
14157
14514
  }
14158
14515
  };
14159
14516
  for (int it = 0; it < nthread - 1; ++it) {
@@ -14162,6 +14519,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14162
14519
  compute();
14163
14520
  for (auto & w : workers) { w.join(); }
14164
14521
  workers.clear();
14522
+ if (!valid) {
14523
+ throw std::runtime_error("quantized data validation failed");
14524
+ }
14165
14525
  return new_size;
14166
14526
  }
14167
14527
 
@@ -14224,7 +14584,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14224
14584
  auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
14225
14585
  kv_overrides = v->data();
14226
14586
  }
14227
- llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
14587
+ llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
14228
14588
  ml.init_mappings(false); // no prefetching
14229
14589
 
14230
14590
  llama_model model;
@@ -14262,11 +14622,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14262
14622
  for (auto & o : overrides) {
14263
14623
  if (o.key[0] == 0) break;
14264
14624
  if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
14265
- gguf_set_val_f32(ctx_out, o.key, o.float_value);
14625
+ gguf_set_val_f32(ctx_out, o.key, o.val_f64);
14266
14626
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
14267
- gguf_set_val_i32(ctx_out, o.key, o.int_value);
14627
+ gguf_set_val_i32(ctx_out, o.key, o.val_i64);
14268
14628
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
14269
- gguf_set_val_bool(ctx_out, o.key, o.bool_value);
14629
+ gguf_set_val_bool(ctx_out, o.key, o.val_bool);
14630
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
14631
+ gguf_set_val_str(ctx_out, o.key, o.val_str);
14270
14632
  } else {
14271
14633
  LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
14272
14634
  }
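With the renamed union members and the new string tag, overrides can now carry string values end to end. A hedged caller-side sketch (the key and value are made up; buffer sizes are taken via sizeof since the exact array lengths live in llama.h, and val_str is assumed to be a fixed-size char buffer):

    llama_model_kv_override o{};
    std::snprintf(o.key, sizeof(o.key), "%s", "general.name");    // hypothetical key
    o.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
    std::snprintf(o.val_str, sizeof(o.val_str), "%s", "my-model"); // hypothetical value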
@@ -14308,26 +14670,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14308
14670
  std::vector<no_init<uint8_t>> work;
14309
14671
  std::vector<no_init<float>> f32_conv_buf;
14310
14672
 
14673
+ uint16_t n_split = 1;
14674
+ // Assume split index is continuous
14675
+ if (params->keep_split) {
14676
+ for (int i = 0; i < ml.n_tensors; ++i) {
14677
+ n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
14678
+ }
14679
+ }
14680
+ std::vector<gguf_context*> ctx_outs(n_split, NULL);
14681
+ ctx_outs[0] = ctx_out;
14682
+
14311
14683
  // populate the original tensors so we get an initial meta data
14312
14684
  for (int i = 0; i < ml.n_tensors; ++i) {
14313
- const struct ggml_tensor * meta = ml.get_tensor_meta(i);
14314
- gguf_add_tensor(ctx_out, meta);
14685
+ auto weight = ml.get_weight(i);
14686
+ uint16_t i_split = params->keep_split ? weight->idx : 0;
14687
+ struct ggml_tensor * tensor = weight->tensor;
14688
+ if (ctx_outs[i_split] == NULL) {
14689
+ ctx_outs[i_split] = gguf_init_empty();
14690
+ }
14691
+ gguf_add_tensor(ctx_outs[i_split], tensor);
14315
14692
  }
14316
14693
 
14317
- std::ofstream fout(fname_out, std::ios::binary);
14318
- fout.exceptions(std::ofstream::failbit); // fail fast on write errors
14319
-
14320
- const size_t meta_size = gguf_get_meta_size(ctx_out);
14694
+ // Set split info if needed
14695
+ if (n_split > 1) {
14696
+ for (size_t i = 0; i < ctx_outs.size(); ++i) {
14697
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
14698
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
14699
+ gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
14700
+ }
14701
+ }
14321
14702
 
14322
- LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
14703
+ int cur_split = -1;
14704
+ std::ofstream fout;
14705
+ auto close_ofstream = [&]() {
14706
+ // Write metadata and close the file handle
14707
+ if (fout.is_open()) {
14708
+ fout.seekp(0);
14709
+ std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
14710
+ gguf_get_meta_data(ctx_outs[cur_split], data.data());
14711
+ fout.write((const char *) data.data(), data.size());
14712
+ fout.close();
14713
+ }
14714
+ };
14715
+ auto new_ofstream = [&](int index) {
14716
+ cur_split = index;
14717
+ GGML_ASSERT(ctx_outs[cur_split] && "Found uninitialized gguf_context");
14718
+ std::string fname = fname_out;
14719
+ if (params->keep_split) {
14720
+ char split_path[PATH_MAX] = {0};
14721
+ llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
14722
+ fname = std::string(split_path);
14723
+ }
14323
14724
 
14324
- // placeholder for the meta data
14325
- ::zeros(fout, meta_size);
14725
+ fout = std::ofstream(fname, std::ios::binary);
14726
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
14727
+ const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
14728
+ // placeholder for the meta data
14729
+ ::zeros(fout, meta_size);
14730
+ };
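When keep_split is set, each shard gets its own output file named via llama_split_path, the same helper used when loading split models. A hedged sketch of the naming (the exact suffix format is defined by llama_split_path; the example output is only indicative):

    char split_path[PATH_MAX] = {0};
    llama_split_path(split_path, sizeof(split_path), "model", /*split_no=*/ 0, /*split_count=*/ 3);
    // expected to produce something like "model-00001-of-00003.gguf"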
14326
14731
 
14327
14732
  const auto tn = LLM_TN(model.arch);
14328
-
14733
+ new_ofstream(0);
14329
14734
  for (int i = 0; i < ml.n_tensors; ++i) {
14330
- struct ggml_tensor * tensor = ml.get_tensor_meta(i);
14735
+ auto weight = ml.get_weight(i);
14736
+ struct ggml_tensor * tensor = weight->tensor;
14737
+ if (weight->idx != cur_split && params->keep_split) {
14738
+ close_ofstream();
14739
+ new_ofstream(weight->idx);
14740
+ }
14331
14741
 
14332
14742
  const std::string name = ggml_get_name(tensor);
14333
14743
 
@@ -14482,26 +14892,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14482
14892
  total_size_new += new_size;
14483
14893
 
14484
14894
  // update the gguf meta data as we go
14485
- gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
14486
- gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
14895
+ gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
14896
+ gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
14487
14897
 
14488
14898
  // write tensor data + padding
14489
14899
  fout.write((const char *) new_data, new_size);
14490
14900
  zeros(fout, GGML_PAD(new_size, align) - new_size);
14491
14901
  }
14492
-
14493
- // go back to beginning of file and write the updated meta data
14494
- {
14495
- fout.seekp(0);
14496
- std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
14497
- gguf_get_meta_data(ctx_out, data.data());
14498
- fout.write((const char *) data.data(), data.size());
14902
+ close_ofstream();
14903
+ for (auto & c:ctx_outs) {
14904
+ gguf_free(c);
14499
14905
  }
14500
14906
 
14501
- fout.close();
14502
-
14503
- gguf_free(ctx_out);
14504
-
14505
14907
  LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
14506
14908
  LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
14507
14909
 
@@ -14545,7 +14947,7 @@ static int llama_apply_lora_from_file_internal(
14545
14947
  std::unique_ptr<llama_model_loader> ml;
14546
14948
  if (path_base_model) {
14547
14949
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
14548
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
14950
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
14549
14951
  ml->init_mappings(/*prefetch*/ false); // no prefetching
14550
14952
  }
14551
14953
 
@@ -14804,6 +15206,7 @@ struct llama_model_params llama_model_default_params() {
14804
15206
  /*.vocab_only =*/ false,
14805
15207
  /*.use_mmap =*/ true,
14806
15208
  /*.use_mlock =*/ false,
15209
+ /*.check_tensors =*/ false,
14807
15210
  };
14808
15211
 
14809
15212
  #ifdef GGML_USE_METAL
@@ -14840,6 +15243,7 @@ struct llama_context_params llama_context_default_params() {
14840
15243
  /*.logits_all =*/ false,
14841
15244
  /*.embeddings =*/ false,
14842
15245
  /*.offload_kqv =*/ true,
15246
+ /*.flash_attn =*/ false,
14843
15247
  /*.abort_callback =*/ nullptr,
14844
15248
  /*.abort_callback_data =*/ nullptr,
14845
15249
  };
@@ -14857,6 +15261,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
14857
15261
  /*.quantize_output_tensor =*/ true,
14858
15262
  /*.only_copy =*/ false,
14859
15263
  /*.pure =*/ false,
15264
+ /*.keep_split =*/ false,
14860
15265
  /*.imatrix =*/ nullptr,
14861
15266
  /*.kv_overrides =*/ nullptr,
14862
15267
  };
@@ -15005,6 +15410,7 @@ struct llama_context * llama_new_context_with_model(
15005
15410
  cparams.defrag_thold = params.defrag_thold;
15006
15411
  cparams.embeddings = params.embeddings;
15007
15412
  cparams.offload_kqv = params.offload_kqv;
15413
+ cparams.flash_attn = params.flash_attn;
15008
15414
  cparams.pooling_type = params.pooling_type;
15009
15415
 
15010
15416
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -15012,12 +15418,20 @@ struct llama_context * llama_new_context_with_model(
15012
15418
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
15013
15419
 
15014
15420
  // this is necessary due to kv_self.n being padded later during inference
15015
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
15421
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
15016
15422
 
15017
15423
  // with causal attention, the batch size is limited by the context size
15018
15424
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
15019
- cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
15020
15425
 
15426
+ // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
15427
+ // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
15428
+ // ref: https://github.com/ggerganov/llama.cpp/pull/5021
15429
+ if (cparams.n_batch < GGML_KQ_MASK_PAD) {
15430
+ LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
15431
+ cparams.n_batch = GGML_KQ_MASK_PAD;
15432
+ }
15433
+
15434
+ cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
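Taken together with the n_ctx padding above, the batch sizing now works roughly as follows for a hypothetical request (GGML_KQ_MASK_PAD is a small padding constant from ggml; its exact value is not assumed here):

    // requested: n_ctx = 1000, n_batch = 16, n_ubatch = 0
    //   n_ctx    -> GGML_PAD(1000, 256) = 1024
    //   n_batch  -> capped by n_ctx for causal models, then raised to GGML_KQ_MASK_PAD (with a warning)
    //   n_ubatch -> min(n_batch, n_batch) = n_batch, since 0 means "use n_batch"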
15021
15435
 
15022
15436
  cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
15023
15437
  hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
@@ -15049,6 +15463,23 @@ struct llama_context * llama_new_context_with_model(
15049
15463
  }
15050
15464
  }
15051
15465
 
15466
+ if (cparams.flash_attn && hparams.use_alibi) {
15467
+ LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
15468
+ cparams.flash_attn = false;
15469
+ }
15470
+
15471
+ if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
15472
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
15473
+ cparams.flash_attn = false;
15474
+ }
15475
+
15476
+ #ifdef GGML_USE_HIPBLAS
15477
+ if (cparams.flash_attn) {
15478
+ LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
15479
+ cparams.flash_attn = false;
15480
+ }
15481
+ #endif
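Flash attention is requested through the new context flag and may still be forced off by the checks above. A minimal sketch of enabling it from the public API:

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // may be turned back off for ALiBi models, Grok, or HIP builds, as above
    llama_context * lctx = llama_new_context_with_model(model, cparams);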
15482
+
15052
15483
  if (params.seed == LLAMA_DEFAULT_SEED) {
15053
15484
  params.seed = time(NULL);
15054
15485
  }
@@ -15056,6 +15487,7 @@ struct llama_context * llama_new_context_with_model(
15056
15487
  LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
15057
15488
  LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
15058
15489
  LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
15490
+ LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
15059
15491
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
15060
15492
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
15061
15493
 
@@ -15184,7 +15616,7 @@ struct llama_context * llama_new_context_with_model(
15184
15616
  }
15185
15617
  ctx->backends.push_back(ctx->backend_cpu);
15186
15618
 
15187
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
15619
+ if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
15188
15620
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
15189
15621
  llama_free(ctx);
15190
15622
  return nullptr;
@@ -15365,6 +15797,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
15365
15797
  case LLM_ARCH_QWEN2:
15366
15798
  case LLM_ARCH_QWEN2MOE:
15367
15799
  case LLM_ARCH_PHI2:
15800
+ case LLM_ARCH_PHI3:
15368
15801
  case LLM_ARCH_GEMMA:
15369
15802
  case LLM_ARCH_STARCODER2:
15370
15803
  return LLAMA_ROPE_TYPE_NEOX;
@@ -15378,6 +15811,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
15378
15811
  return LLAMA_ROPE_TYPE_NONE;
15379
15812
  }
15380
15813
 
15814
+ enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
15815
+ return ctx->cparams.pooling_type;
15816
+ }
15817
+
15381
15818
  int32_t llama_n_vocab(const struct llama_model * model) {
15382
15819
  return model->hparams.n_vocab;
15383
15820
  }
@@ -15778,6 +16215,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
15778
16215
  const size_t s_kv_head = sizeof(uint32_t);
15779
16216
  const size_t s_kv_size = sizeof(uint32_t);
15780
16217
  const size_t s_kv_used = sizeof(uint32_t);
16218
+ const size_t s_v_trans = sizeof(uint32_t);
15781
16219
  const size_t s_kv = ctx->kv_self.total_size();
15782
16220
  const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
15783
16221
  const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
@@ -15795,10 +16233,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
15795
16233
  + s_kv_head
15796
16234
  + s_kv_size
15797
16235
  + s_kv_used
16236
+ + s_v_trans
15798
16237
  + s_kv
15799
16238
  + s_kv_cells
15800
16239
  );
15801
16240
 
16241
+ // on session change it is very likely that the state size has changed - so we need to update this function
16242
+ static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
16243
+
15802
16244
  return s_total;
15803
16245
  }
15804
16246
 
@@ -15856,6 +16298,8 @@ struct llama_data_file_context : llama_data_context {
15856
16298
  *
15857
16299
  */
15858
16300
  static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
16301
+ llama_synchronize(ctx);
16302
+
15859
16303
  // copy rng
15860
16304
  {
15861
16305
  std::ostringstream rng_ss;
@@ -15942,11 +16386,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
15942
16386
  const uint32_t kv_size = kv_self.size;
15943
16387
  const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
15944
16388
  const uint32_t kv_used = kv_self.used;
16389
+ const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
15945
16390
 
15946
16391
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
15947
16392
  data_ctx->write(&kv_head, sizeof(kv_head));
15948
16393
  data_ctx->write(&kv_size, sizeof(kv_size));
15949
16394
  data_ctx->write(&kv_used, sizeof(kv_used));
16395
+ data_ctx->write(&v_trans, sizeof(v_trans));
15950
16396
 
15951
16397
  if (kv_buf_size) {
15952
16398
  const size_t pre_kv_buf_size = data_ctx->get_size_written();
@@ -15959,7 +16405,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
15959
16405
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
15960
16406
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
15961
16407
 
15962
- if (kv_self.recurrent) {
16408
+ if (kv_self.recurrent || !kv_self.v_trans) {
15963
16409
  // v is contiguous for recurrent models
15964
16410
  // TODO: use other tensors for state models than k and v
15965
16411
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16008,6 +16454,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
16008
16454
 
16009
16455
  // Sets the state reading from the specified source address
16010
16456
  size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16457
+ llama_synchronize(ctx);
16458
+
16011
16459
  const uint8_t * inp = src;
16012
16460
 
16013
16461
  // set rng
@@ -16090,11 +16538,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16090
16538
  uint32_t kv_head;
16091
16539
  uint32_t kv_size;
16092
16540
  uint32_t kv_used;
16541
+ uint32_t v_trans;
16093
16542
 
16094
16543
  memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
16095
16544
  memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
16096
16545
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
16097
16546
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
16547
+ memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
16548
+
16549
+ GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
16098
16550
 
16099
16551
  if (kv_self.size != kv_size) {
16100
16552
  // the KV cache needs to be big enough to load all the KV cells from the saved state
@@ -16104,6 +16556,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16104
16556
  __func__, kv_head, kv_size, kv_self.size);
16105
16557
  }
16106
16558
 
16559
+ llama_kv_cache_clear(ctx);
16560
+
16107
16561
  if (kv_buf_size) {
16108
16562
  const size_t pre_kv_buf_size = inp - src;
16109
16563
 
@@ -16115,7 +16569,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16115
16569
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
16116
16570
  inp += k_size;
16117
16571
 
16118
- if (kv_self.recurrent) {
16572
+ if (kv_self.recurrent || !kv_self.v_trans) {
16119
16573
  // v is contiguous for recurrent models
16120
16574
  // TODO: use other tensors for state models than k and v
16121
16575
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16137,8 +16591,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16137
16591
  GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
16138
16592
  }
16139
16593
 
16140
- llama_kv_cache_clear(ctx);
16141
-
16142
16594
  ctx->kv_self.head = kv_head;
16143
16595
  ctx->kv_self.used = kv_used;
16144
16596
 
@@ -16312,6 +16764,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
16312
16764
  }
16313
16765
 
16314
16766
  static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
16767
+ llama_synchronize(ctx);
16768
+
16315
16769
  const auto & kv_self = ctx->kv_self;
16316
16770
  GGML_ASSERT(!kv_self.recurrent); // not implemented
16317
16771
 
@@ -16396,28 +16850,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
16396
16850
  }
16397
16851
  }
16398
16852
 
16399
- // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16400
- const uint32_t kv_size = kv_self.size;
16401
- for (int il = 0; il < (int)n_layer; ++il) {
16402
- // Write value type
16403
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16404
- data_ctx.write(&v_type_i, sizeof(v_type_i));
16853
+ // TODO: simplify, reduce copy-paste
16854
+ if (!kv_self.v_trans) {
16855
+ for (int il = 0; il < (int)n_layer; ++il) {
16856
+ // Write value type
16857
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16858
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16405
16859
 
16406
- // Write element size
16407
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16408
- data_ctx.write(&v_size_el, sizeof(v_size_el));
16860
+ // Write row size of value
16861
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
16862
+ data_ctx.write(&v_size_row, sizeof(v_size_row));
16409
16863
 
16410
- // For each row, we get the element values of each cell
16411
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16412
- // Read each range of cells of v_size_el length each into tmp_buf and write out
16864
+ // Read each range of cells of v_size length each into tmp_buf and write out
16413
16865
  for (const auto & range : cell_ranges) {
16414
16866
  const size_t range_size = range.second - range.first;
16415
- const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16416
- tmp_buf.resize(range_size * v_size_el);
16417
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16867
+ tmp_buf.resize(range_size * v_size_row);
16868
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
16418
16869
  data_ctx.write(tmp_buf.data(), tmp_buf.size());
16419
16870
  }
16420
16871
  }
16872
+ } else {
16873
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16874
+ const uint32_t kv_size = kv_self.size;
16875
+ for (int il = 0; il < (int)n_layer; ++il) {
16876
+ // Write value type
16877
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16878
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16879
+
16880
+ // Write element size
16881
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16882
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
16883
+
16884
+ // For each row, we get the element values of each cell
16885
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16886
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
16887
+ for (const auto & range : cell_ranges) {
16888
+ const size_t range_size = range.second - range.first;
16889
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16890
+ tmp_buf.resize(range_size * v_size_el);
16891
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16892
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
16893
+ }
16894
+ }
16895
+ }
16421
16896
  }
16422
16897
 
16423
16898
  return data_ctx.get_size_written();
@@ -16429,6 +16904,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
16429
16904
  }
16430
16905
 
16431
16906
  size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
16907
+ llama_synchronize(ctx);
16908
+
16432
16909
  auto & kv_self = ctx->kv_self;
16433
16910
  GGML_ASSERT(!kv_self.recurrent); // not implemented
16434
16911
 
@@ -16540,41 +17017,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
         }
     }
 
-    // For each layer, read the values for each cell (transposed)
-    for (int il = 0; il < (int)n_layer; ++il) {
-        // Read type of value
-        int32_t v_type_i_ref;
-        memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
-        inp += sizeof(v_type_i_ref);
-        const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
-        if (v_type_i != v_type_i_ref) {
-            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
-            LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
-            return 0;
-        }
+    // TODO: simplify, reduce copy-paste
+    if (!kv_self.v_trans) {
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Read type of value
+            int32_t v_type_i_ref;
+            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+            inp += sizeof(v_type_i_ref);
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return 0;
+            }
 
-        // Read element size of value
-        size_t v_size_el_ref;
-        memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
-        inp += sizeof(v_size_el_ref);
-        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
-        if (v_size_el != v_size_el_ref) {
-            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
-            LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
-            return 0;
-        }
+            // Read row size of value
+            size_t v_size_row_ref;
+            memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
+            inp += sizeof(v_size_row_ref);
+            const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+            if (v_size_row != v_size_row_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
+                return 0;
+            }
 
-        if (cell_count) {
-            // For each row in the transposed matrix, read the values for the whole cell range
-            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
-                ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
-                inp += cell_count * v_size_el;
+            if (cell_count) {
+                // Read and set the values for the whole cell range
+                ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
+                inp += cell_count * v_size_row;
+            }
+        }
+    } else {
+        // For each layer, read the values for each cell (transposed)
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Read type of value
+            int32_t v_type_i_ref;
+            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+            inp += sizeof(v_type_i_ref);
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return 0;
+            }
+
+            // Read element size of value
+            size_t v_size_el_ref;
+            memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
+            inp += sizeof(v_size_el_ref);
+            const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+            if (v_size_el != v_size_el_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
+                return 0;
+            }
+
+            if (cell_count) {
+                // For each row in the transposed matrix, read the values for the whole cell range
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
+                    ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
+                    inp += cell_count * v_size_el;
+                }
             }
         }
     }
 
     const size_t nread = inp - src;
+
     return nread;
 }
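A hedged usage sketch of the sequence-state API touched by this hunk. It assumes the llama_state_seq_get_size / llama_state_seq_get_data / llama_state_seq_set_data declarations from this release's llama.h, and that both contexts use the same model and KV-cache types; as the checks above show, set_data returns 0 when the stored types or sizes do not match.

#include <cstdint>
#include <vector>
#include "llama.h"

// Copy sequence 0 of ctx_src into sequence 1 of ctx_dst.
// This is an illustrative sketch, not code from the diff.
static bool copy_seq_state(llama_context * ctx_src, llama_context * ctx_dst) {
    const size_t n = llama_state_seq_get_size(ctx_src, 0);
    std::vector<uint8_t> buf(n);
    if (llama_state_seq_get_data(ctx_src, buf.data(), 0) != n) {
        return false;
    }
    // returns 0 on a mismatch (different cache types, row sizes, ...)
    return llama_state_seq_set_data(ctx_dst, buf.data(), 1) != 0;
}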
 
@@ -16880,6 +17391,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
     return model->vocab.id_to_token[token].type;
 }
 
+bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
+    return token != -1 && (
+        token == llama_token_eos(model) ||
+        token == llama_token_eot(model)
+    );
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
     return model->vocab.special_bos_id;
 }
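llama_token_is_eog folds the EOS/EOT comparison into a single call. A minimal sketch of how a caller's sampling loop might use it; sample_next_token is a hypothetical placeholder for the caller's own sampling code:

#include "llama.h"

// Hypothetical: provided by the caller's sampling code, not by llama.cpp.
llama_token sample_next_token(llama_context * ctx);

static void generate(llama_context * ctx, int n_predict) {
    const llama_model * model = llama_get_model(ctx);
    for (int i = 0; i < n_predict; ++i) {
        const llama_token id = sample_next_token(ctx);
        if (llama_token_is_eog(model, id)) {
            break; // stop on EOS or EOT without checking both tokens by hand
        }
        // ... decode and emit the token ...
    }
}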
@@ -16957,7 +17475,7 @@ static std::string llama_decode_text(const std::string & text) {
 }
 
 // does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
             case LLAMA_VOCAB_TYPE_WPM:
@@ -16972,7 +17490,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                    }
                    memcpy(buf, result.c_str(), result.length());
                    return result.length();
-                } else if (llama_is_user_defined_token(model->vocab, token)) {
+                } else if (
+                        (llama_is_user_defined_token(model->vocab, token)) ||
+                        (llama_is_control_token     (model->vocab, token) && special)) {
                    std::string result = model->vocab.id_to_token[token].text;
                    if (length < (int) result.length()) {
                        return -(int) result.length();
@@ -16985,8 +17505,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                    }
                    memcpy(buf, "\xe2\x96\x85", 3);
                    return 3;
-                } else if (llama_is_control_token(model->vocab, token)) {
-                    ;
                 } else if (llama_is_byte_token(model->vocab, token)) {
                     if (length < 1) {
                         return -1;
@@ -17007,15 +17525,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                    }
                    memcpy(buf, result.c_str(), result.length());
                    return result.length();
-                } else if (llama_is_user_defined_token(model->vocab, token)) {
+                } else if (
+                        (llama_is_user_defined_token(model->vocab, token)) ||
+                        (llama_is_control_token     (model->vocab, token) && special)) {
                    std::string result = model->vocab.id_to_token[token].text;
                    if (length < (int) result.length()) {
                        return -(int) result.length();
                    }
                    memcpy(buf, result.c_str(), result.length());
                    return result.length();
-                } else if (llama_is_control_token(model->vocab, token)) {
-                    ;
                 }
                 break;
             }
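The new trailing special parameter controls whether control tokens are rendered as their literal text (true) or suppressed as before (false). A small illustrative sketch, assuming only the llama_token_to_piece signature shown in this diff; the negative-return convention for a too-small buffer is unchanged:

#include <cstdio>
#include "llama.h"

// Illustrative only: print a token twice, once with and once without the
// special flag. With special == true a control token such as EOS is rendered
// as its literal text; with special == false it produces no output.
static void print_token(const llama_model * model, llama_token token) {
    char buf[64];
    const int32_t n_plain   = llama_token_to_piece(model, token, buf, (int32_t) sizeof(buf), /*special=*/false);
    printf("plain:   %.*s\n", n_plain > 0 ? n_plain : 0, buf);
    const int32_t n_special = llama_token_to_piece(model, token, buf, (int32_t) sizeof(buf), /*special=*/true);
    printf("special: %.*s\n", n_special > 0 ? n_special : 0, buf);
}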
@@ -17213,6 +17731,24 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
         }
+    } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
+        // Llama 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
+        }
+        if (add_ass) {
+            ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
+        }
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else {
         // template not supported
         return -1;
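Both new templates are also reachable through the public llama_chat_apply_template API by passing "llama3" or "phi3" as the template name (or nullptr to use the template stored in the model's GGUF metadata). A hedged sketch; the helper name is hypothetical and it assumes the llama_chat_apply_template / llama_chat_message declarations from this release's llama.h, where a return value larger than the buffer is the required length:

#include <cstdint>
#include <string>
#include <vector>
#include "llama.h"

// Format a conversation with the newly recognized "phi3" template.
// Illustrative sketch, not code from the diff.
static std::string format_phi3(const std::vector<llama_chat_message> & msgs) {
    std::string buf(1024, '\0');
    int32_t n = llama_chat_apply_template(nullptr, "phi3", msgs.data(), msgs.size(),
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > (int32_t) buf.size()) {
        // the return value is the required length; grow and retry
        buf.resize(n);
        n = llama_chat_apply_template(nullptr, "phi3", msgs.data(), msgs.size(),
                                      true, buf.data(), (int32_t) buf.size());
    }
    buf.resize(n > 0 ? n : 0);
    return buf;
}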
@@ -17345,6 +17881,11 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+#ifdef GGML_USE_LLAMAFILE
+    s += "LLAMAFILE = 1 | ";
+#else
+    s += "LLAMAFILE = 0 | ";
+#endif
 
     return s.c_str();
 }