llama_cpp 0.14.6 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -75,6 +75,7 @@
75
75
  #include <forward_list>
76
76
  #include <fstream>
77
77
  #include <functional>
78
+ #include <future>
78
79
  #include <initializer_list>
79
80
  #include <locale>
80
81
  #include <map>
@@ -107,7 +108,6 @@
107
108
  #define LLAMA_MAX_NODES 8192
108
109
  #define LLAMA_MAX_EXPERTS 60
109
110
 
110
-
111
111
  //
112
112
  // logging
113
113
  //
@@ -211,6 +211,7 @@ enum llm_arch {
211
211
  LLM_ARCH_QWEN2,
212
212
  LLM_ARCH_QWEN2MOE,
213
213
  LLM_ARCH_PHI2,
214
+ LLM_ARCH_PHI3,
214
215
  LLM_ARCH_PLAMO,
215
216
  LLM_ARCH_CODESHELL,
216
217
  LLM_ARCH_ORION,
@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
246
247
  { LLM_ARCH_QWEN2, "qwen2" },
247
248
  { LLM_ARCH_QWEN2MOE, "qwen2moe" },
248
249
  { LLM_ARCH_PHI2, "phi2" },
250
+ { LLM_ARCH_PHI3, "phi3" },
249
251
  { LLM_ARCH_PLAMO, "plamo" },
250
252
  { LLM_ARCH_CODESHELL, "codeshell" },
251
253
  { LLM_ARCH_ORION, "orion" },
@@ -314,6 +316,7 @@ enum llm_kv {
314
316
  LLM_KV_SSM_TIME_STEP_RANK,
315
317
 
316
318
  LLM_KV_TOKENIZER_MODEL,
319
+ LLM_KV_TOKENIZER_PRE,
317
320
  LLM_KV_TOKENIZER_LIST,
318
321
  LLM_KV_TOKENIZER_TOKEN_TYPE,
319
322
  LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -390,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
390
393
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
391
394
 
392
395
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
396
+ { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
393
397
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
394
398
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
395
399
  { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
@@ -793,6 +797,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
793
797
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
794
798
  },
795
799
  },
800
+ {
801
+ LLM_ARCH_PHI3,
802
+ {
803
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
804
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
805
+ { LLM_TENSOR_OUTPUT, "output" },
806
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
807
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
808
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
809
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
810
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
811
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
812
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
813
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
814
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
815
+ },
816
+ },
796
817
  {
797
818
  LLM_ARCH_PLAMO,
798
819
  {
@@ -1600,12 +1621,12 @@ struct llama_mlock {
1600
1621
  };
1601
1622
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
1602
1623
 
1603
- static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
1624
+ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
1604
1625
  std::vector<char> result(8, 0);
1605
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
1626
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
1606
1627
  if (n_tokens < 0) {
1607
1628
  result.resize(-n_tokens);
1608
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
1629
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
1609
1630
  GGML_ASSERT(check == -n_tokens);
1610
1631
  }
1611
1632
  else {
@@ -1824,7 +1845,7 @@ struct llama_hparams {
1824
1845
  float f_logit_scale = 0.0f;
1825
1846
 
1826
1847
  bool causal_attn = true;
1827
- bool need_kq_pos = false;
1848
+ bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
1828
1849
 
1829
1850
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1830
1851
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1914,6 +1935,7 @@ struct llama_cparams {
1914
1935
  bool embeddings;
1915
1936
  bool causal_attn;
1916
1937
  bool offload_kqv;
1938
+ bool flash_attn;
1917
1939
 
1918
1940
  enum llama_pooling_type pooling_type;
1919
1941
 
@@ -2017,8 +2039,8 @@ struct llama_kv_cache {
2017
2039
  bool has_shift = false;
2018
2040
  bool do_defrag = false;
2019
2041
  bool do_copy = false;
2020
- // with recurrent state models, a cell can hold the state for more than one past token
2021
- bool recurrent = false;
2042
+ bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
2043
+ bool v_trans = true; // the value tensor is transposed
2022
2044
 
2023
2045
  // Note: The value of head isn't only used to optimize searching
2024
2046
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -2095,7 +2117,8 @@ struct llama_vocab {
2095
2117
  ttype type;
2096
2118
  };
2097
2119
 
2098
- enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
2120
+ enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
2121
+ enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2099
2122
 
2100
2123
  std::unordered_map<token, id> token_to_id;
2101
2124
  std::vector<token_data> id_to_token;
@@ -2120,7 +2143,7 @@ struct llama_vocab {
2120
2143
  id special_prefix_id = -1;
2121
2144
  id special_suffix_id = -1;
2122
2145
  id special_middle_id = -1;
2123
- id special_eot_id = -1;
2146
+ id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
2124
2147
 
2125
2148
  bool add_space_prefix = true;
2126
2149
 
@@ -2316,11 +2339,14 @@ struct llama_context {
2316
2339
 
2317
2340
  static bool llama_kv_cache_init(
2318
2341
  struct llama_kv_cache & cache,
2319
- const llama_model & model,
2342
+ const llama_context * ctx,
2320
2343
  ggml_type type_k,
2321
2344
  ggml_type type_v,
2322
2345
  uint32_t kv_size,
2323
2346
  bool offload) {
2347
+ const llama_model & model = ctx->model;
2348
+ const llama_cparams & cparams = ctx->cparams;
2349
+
2324
2350
  const struct llama_hparams & hparams = model.hparams;
2325
2351
 
2326
2352
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
@@ -2331,8 +2357,9 @@ static bool llama_kv_cache_init(
2331
2357
 
2332
2358
  // TODO: find a nicer way to add other recurrent model architectures
2333
2359
  cache.recurrent = model.arch == LLM_ARCH_MAMBA;
2360
+ cache.v_trans = !cparams.flash_attn;
2334
2361
 
2335
- // TODO: support mixed reccurent Transformer architectues
2362
+ // TODO: support mixed recurrent Transformer architectures
2336
2363
  // NOTE: (!a || b) is a logical implication (a -> b)
2337
2364
  GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
2338
2365
  GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
@@ -2543,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
2543
2570
  }
2544
2571
  cache.head = 0;
2545
2572
  cache.used = 0;
2573
+
2574
+ for (auto & buf : cache.bufs) {
2575
+ ggml_backend_buffer_clear(buf, 0);
2576
+ }
2546
2577
  }
2547
2578
 
2548
2579
  static bool llama_kv_cache_seq_rm(
@@ -2863,6 +2894,7 @@ namespace GGUFMeta {
2863
2894
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
2864
2895
  case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
2865
2896
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
2897
+ case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
2866
2898
  }
2867
2899
  return "unknown";
2868
2900
  }
@@ -2874,13 +2906,16 @@ namespace GGUFMeta {
2874
2906
  __func__, override_type_to_str(ovrd->tag), ovrd->key);
2875
2907
  switch (ovrd->tag) {
2876
2908
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
2877
- LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
2909
+ LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
2878
2910
  } break;
2879
2911
  case LLAMA_KV_OVERRIDE_TYPE_INT: {
2880
- LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
2912
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
2881
2913
  } break;
2882
2914
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
2883
- LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
2915
+ LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
2916
+ } break;
2917
+ case LLAMA_KV_OVERRIDE_TYPE_STR: {
2918
+ LLAMA_LOG_INFO("%s\n", ovrd->val_str);
2884
2919
  } break;
2885
2920
  default:
2886
2921
  // Shouldn't be possible to end up here, but just in case...
@@ -2899,7 +2934,7 @@ namespace GGUFMeta {
2899
2934
  static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
2900
2935
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2901
2936
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
2902
- target = ovrd->bool_value;
2937
+ target = ovrd->val_bool;
2903
2938
  return true;
2904
2939
  }
2905
2940
  return false;
@@ -2909,7 +2944,7 @@ namespace GGUFMeta {
2909
2944
  static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
2910
2945
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2911
2946
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
2912
- target = ovrd->int_value;
2947
+ target = ovrd->val_i64;
2913
2948
  return true;
2914
2949
  }
2915
2950
  return false;
@@ -2919,7 +2954,7 @@ namespace GGUFMeta {
2919
2954
  static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
2920
2955
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
2921
2956
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
2922
- target = ovrd->float_value;
2957
+ target = ovrd->val_f64;
2923
2958
  return true;
2924
2959
  }
2925
2960
  return false;
@@ -2928,12 +2963,11 @@ namespace GGUFMeta {
2928
2963
  template<typename OT>
2929
2964
  static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
2930
2965
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
2931
- (void)target;
2932
- (void)ovrd;
2933
- if (!ovrd) { return false; }
2934
- // Currently, we should never end up here so it would be a bug if we do.
2935
- throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
2936
- ovrd ? ovrd->key : "NULL"));
2966
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
2967
+ target = ovrd->val_str;
2968
+ return true;
2969
+ }
2970
+ return false;
2937
2971
  }
2938
2972
 
2939
2973
  static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
@@ -2966,6 +3000,7 @@ struct llama_model_loader {
2966
3000
  size_t n_bytes = 0;
2967
3001
 
2968
3002
  bool use_mmap = false;
3003
+ bool check_tensors;
2969
3004
 
2970
3005
  llama_files files;
2971
3006
  llama_ftype ftype;
@@ -2980,9 +3015,13 @@ struct llama_model_loader {
2980
3015
 
2981
3016
  ggml_tensor * tensor;
2982
3017
 
2983
- llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
3018
+ llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
2984
3019
  const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
2985
3020
  offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
3021
+
3022
+ if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
3023
+ throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
3024
+ }
2986
3025
  }
2987
3026
  };
2988
3027
  std::vector<llama_tensor_weight> weights;
@@ -2995,7 +3034,7 @@ struct llama_model_loader {
2995
3034
  std::string arch_name;
2996
3035
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
2997
3036
 
2998
- llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
3037
+ llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
2999
3038
  int trace = 0;
3000
3039
  if (getenv("LLAMA_TRACE")) {
3001
3040
  trace = atoi(getenv("LLAMA_TRACE"));
@@ -3021,15 +3060,15 @@ struct llama_model_loader {
3021
3060
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
3022
3061
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));
3023
3062
 
3063
+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
3064
+ contexts.emplace_back(ctx);
3065
+
3024
3066
  // Save tensors data offset of the main file.
3025
3067
  // For subsidiary files, `meta` tensor data offset must not be used,
3026
3068
  // so we build a unified tensors index for weights.
3027
3069
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
3028
- weights.emplace_back(0, cur->name, meta, cur);
3070
+ weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
3029
3071
  }
3030
- files.emplace_back(new llama_file(fname.c_str(), "rb"));
3031
- contexts.emplace_back(ctx);
3032
-
3033
3072
  uint16_t n_split = 0;
3034
3073
  get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
3035
3074
 
@@ -3063,12 +3102,13 @@ struct llama_model_loader {
3063
3102
  throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
3064
3103
  }
3065
3104
 
3105
+ files.emplace_back(new llama_file(split_path, "rb"));
3106
+ contexts.emplace_back(ctx);
3107
+
3066
3108
  // Save tensors data offset info of the shard.
3067
3109
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
3068
- weights.emplace_back(idx, cur->name, ctx_gguf, cur);
3110
+ weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
3069
3111
  }
3070
- files.emplace_back(new llama_file(split_path, "rb"));
3071
- contexts.emplace_back(ctx);
3072
3112
 
3073
3113
  gguf_free(ctx_gguf);
3074
3114
  }
@@ -3091,9 +3131,17 @@ struct llama_model_loader {
3091
3131
 
3092
3132
  fver = (enum llama_fver) gguf_get_version(meta);
3093
3133
 
3134
+ std::set<std::string> tensor_names;
3094
3135
  for (auto & w : weights) {
3095
3136
  n_elements += ggml_nelements(w.tensor);
3096
3137
  n_bytes += ggml_nbytes(w.tensor);
3138
+ // make sure there is no duplicated tensor names
3139
+ const std::string name(w.tensor->name);
3140
+ auto found = tensor_names.find(name);
3141
+ if (found != tensor_names.end()) {
3142
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
3143
+ }
3144
+ tensor_names.insert(name);
3097
3145
  }
3098
3146
 
3099
3147
  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -3199,6 +3247,7 @@ struct llama_model_loader {
3199
3247
  }
3200
3248
 
3201
3249
  this->use_mmap = use_mmap;
3250
+ this->check_tensors = check_tensors;
3202
3251
  }
3203
3252
 
3204
3253
  ~llama_model_loader() {
@@ -3278,6 +3327,10 @@ struct llama_model_loader {
3278
3327
  return nullptr;
3279
3328
  }
3280
3329
 
3330
+ const llama_tensor_weight * get_weight(int i) const {
3331
+ return get_weight(get_tensor_name(i));
3332
+ }
3333
+
3281
3334
  const llama_tensor_weight & require_weight(const char * name) const {
3282
3335
  const llama_tensor_weight * weight = get_weight(name);
3283
3336
  if (!weight) {
@@ -3453,6 +3506,10 @@ struct llama_model_loader {
3453
3506
  file->seek(w.offs, SEEK_SET);
3454
3507
  file->read_raw(cur->data, ggml_nbytes(cur));
3455
3508
  }
3509
+
3510
+ if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
3511
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3512
+ }
3456
3513
  }
3457
3514
 
3458
3515
  size_t size_done = 0;
@@ -3469,6 +3526,8 @@ struct llama_model_loader {
3469
3526
  GGML_ASSERT(size_data != 0 && "call init_mappings() first");
3470
3527
 
3471
3528
  std::vector<no_init<uint8_t>> read_buf;
3529
+ std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
3530
+
3472
3531
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3473
3532
  const auto * weight = get_weight(ggml_get_name(cur));
3474
3533
  if (weight == nullptr) {
@@ -3490,37 +3549,66 @@ struct llama_model_loader {
3490
3549
  if (bufs_mmap.count(weight->idx)) {
3491
3550
  buf_mmap = bufs_mmap.at(weight->idx);
3492
3551
  }
3552
+ uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
3553
+
3554
+ if (check_tensors) {
3555
+ validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
3556
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
3557
+ }));
3558
+ }
3559
+
3493
3560
  GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
3494
3561
  if (buf_mmap && cur->data == nullptr) {
3495
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
3562
+ ggml_backend_tensor_alloc(buf_mmap, cur, data);
3496
3563
  if (lmlocks) {
3497
3564
  const auto & lmlock = lmlocks->at(weight->idx);
3498
- lmlock->grow_to(weight->offs + ggml_nbytes(cur));
3565
+ lmlock->grow_to(weight->offs + n_size);
3499
3566
  }
3500
3567
 
3501
3568
  auto & mmap_used = mmaps_used[weight->idx];
3502
3569
  mmap_used.first = std::min(mmap_used.first, weight->offs);
3503
3570
  mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
3504
3571
  } else {
3505
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
3572
+ ggml_backend_tensor_set(cur, data, 0, n_size);
3506
3573
  }
3507
3574
  } else {
3508
3575
  GGML_ASSERT(weight->idx < files.size());
3509
3576
  const auto & file = files.at(weight->idx);
3510
3577
  if (ggml_backend_buffer_is_host(cur->buffer)) {
3511
3578
  file->seek(weight->offs, SEEK_SET);
3512
- file->read_raw(cur->data, ggml_nbytes(cur));
3579
+ file->read_raw(cur->data, n_size);
3580
+ if (check_tensors) {
3581
+ validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
3582
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
3583
+ }));
3584
+ }
3513
3585
  } else {
3514
- read_buf.resize(ggml_nbytes(cur));
3586
+ read_buf.resize(n_size);
3515
3587
  file->seek(weight->offs, SEEK_SET);
3516
- file->read_raw(read_buf.data(), ggml_nbytes(cur));
3588
+ file->read_raw(read_buf.data(), n_size);
3517
3589
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3590
+ if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
3591
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3592
+ }
3518
3593
  }
3519
3594
  }
3520
3595
 
3521
3596
  size_done += n_size;
3522
3597
  }
3523
3598
 
3599
+ // check validation results
3600
+ bool validation_failed = false;
3601
+ for (auto & future : validation_result) {
3602
+ auto result = future.get();
3603
+ if (!result.second) {
3604
+ LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
3605
+ validation_failed = true;
3606
+ }
3607
+ }
3608
+ if (validation_failed) {
3609
+ throw std::runtime_error("found tensors with invalid data");
3610
+ }
3611
+
3524
3612
  // check if this is the last call and do final cleanup
3525
3613
  if (size_done >= size_data) {
3526
3614
  // unmap offloaded tensors and metadata
@@ -3770,7 +3858,7 @@ static void llm_load_hparams(
3770
3858
  switch (hparams.n_layer) {
3771
3859
  case 22: model.type = e_model::MODEL_1B; break;
3772
3860
  case 26: model.type = e_model::MODEL_3B; break;
3773
- case 32: model.type = e_model::MODEL_7B; break;
3861
+ case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
3774
3862
  case 40: model.type = e_model::MODEL_13B; break;
3775
3863
  case 48: model.type = e_model::MODEL_34B; break;
3776
3864
  case 60: model.type = e_model::MODEL_30B; break;
@@ -3955,6 +4043,16 @@ static void llm_load_hparams(
3955
4043
  {
3956
4044
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3957
4045
 
4046
+ switch (hparams.n_layer) {
4047
+ case 24: model.type = e_model::MODEL_1B; break;
4048
+ case 32: model.type = e_model::MODEL_3B; break;
4049
+ default: model.type = e_model::MODEL_UNKNOWN;
4050
+ }
4051
+ } break;
4052
+ case LLM_ARCH_PHI3:
4053
+ {
4054
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
4055
+
3958
4056
  switch (hparams.n_layer) {
3959
4057
  case 24: model.type = e_model::MODEL_1B; break;
3960
4058
  case 32: model.type = e_model::MODEL_3B; break;
@@ -4104,7 +4202,7 @@ static void llm_load_hparams(
4104
4202
  model.ftype = ml.ftype;
4105
4203
 
4106
4204
  if (hparams.f_max_alibi_bias > 0.0f) {
4107
- hparams.need_kq_pos = true;
4205
+ hparams.use_alibi = true;
4108
4206
  }
4109
4207
 
4110
4208
  hparams.rope_type = llama_rope_type(&model);
@@ -4127,11 +4225,13 @@ static void llm_load_vocab(
4127
4225
 
4128
4226
  // determine vocab type
4129
4227
  {
4130
- std::string tokenizer_name;
4228
+ std::string tokenizer_model;
4229
+ std::string tokenizer_pre;
4131
4230
 
4132
- ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
4231
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
4232
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
4133
4233
 
4134
- if (tokenizer_name == "no_vocab") {
4234
+ if (tokenizer_model == "no_vocab") {
4135
4235
  vocab.type = LLAMA_VOCAB_TYPE_NONE;
4136
4236
 
4137
4237
  // default special tokens
@@ -4145,7 +4245,7 @@ static void llm_load_vocab(
4145
4245
  vocab.linefeed_id = -1;
4146
4246
 
4147
4247
  return;
4148
- } else if (tokenizer_name == "llama") {
4248
+ } else if (tokenizer_model == "llama") {
4149
4249
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
4150
4250
 
4151
4251
  // default special tokens
@@ -4179,7 +4279,10 @@ static void llm_load_vocab(
4179
4279
  vocab.special_prefix_id = 67;
4180
4280
  vocab.special_suffix_id = 69;
4181
4281
  vocab.special_middle_id = 68;
4182
- vocab.special_eot_id = 70;
4282
+ // TODO: this is not EOT, it is "file separator" token, needs fix
4283
+ // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
4284
+ //vocab.special_eot_id = 70;
4285
+ vocab.special_eot_id = 107;
4183
4286
  }
4184
4287
  }
4185
4288
 
@@ -4187,9 +4290,27 @@ static void llm_load_vocab(
4187
4290
  if (add_space_prefix_keyidx != -1) {
4188
4291
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4189
4292
  } // The default value of add_space_prefix is true.
4190
- } else if (tokenizer_name == "gpt2") {
4191
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
4293
+ } else if (tokenizer_model == "bert") {
4294
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;
4192
4295
 
4296
+ // default special tokens
4297
+ vocab.special_bos_id = -1;
4298
+ vocab.special_eos_id = -1;
4299
+ vocab.special_unk_id = 100;
4300
+ vocab.special_sep_id = 102;
4301
+ vocab.special_pad_id = 0;
4302
+ vocab.special_cls_id = 101;
4303
+ vocab.special_mask_id = 103;
4304
+ vocab.add_space_prefix = false;
4305
+ } else {
4306
+ if (tokenizer_model == "gpt2") {
4307
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
4308
+ } else {
4309
+ LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
4310
+ LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4311
+ vocab.type = LLAMA_VOCAB_TYPE_SPM;
4312
+ return;
4313
+ }
4193
4314
  // read bpe merges and populate bpe ranks
4194
4315
  const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
4195
4316
  if (merges_keyidx == -1) {
@@ -4223,23 +4344,50 @@ static void llm_load_vocab(
4223
4344
  vocab.special_pad_id = -1;
4224
4345
  vocab.special_cls_id = -1;
4225
4346
  vocab.special_mask_id = -1;
4226
- } else if (tokenizer_name == "bert") {
4227
- vocab.type = LLAMA_VOCAB_TYPE_WPM;
4347
+ }
4228
4348
 
4229
- // default special tokens
4230
- vocab.special_bos_id = -1;
4231
- vocab.special_eos_id = -1;
4232
- vocab.special_unk_id = 100;
4233
- vocab.special_sep_id = 102;
4234
- vocab.special_pad_id = 0;
4235
- vocab.special_cls_id = 101;
4236
- vocab.special_mask_id = 103;
4237
- vocab.add_space_prefix = false;
4349
+ // for now, only BPE models have pre-tokenizers
4350
+ if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
4351
+ if (tokenizer_pre.empty()) {
4352
+ LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
4353
+ LLAMA_LOG_WARN("%s: \n", __func__);
4354
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4355
+ LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
4356
+ LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
4357
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
4358
+ LLAMA_LOG_WARN("%s: \n", __func__);
4359
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4360
+ } else if (
4361
+ tokenizer_pre == "default") {
4362
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4363
+ } else if (
4364
+ tokenizer_pre == "llama3" ||
4365
+ tokenizer_pre == "llama-v3" ||
4366
+ tokenizer_pre == "llama-bpe") {
4367
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
4368
+ } else if (
4369
+ tokenizer_pre == "deepseek-llm") {
4370
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
4371
+ } else if (
4372
+ tokenizer_pre == "deepseek-coder") {
4373
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
4374
+ } else if (
4375
+ tokenizer_pre == "falcon") {
4376
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
4377
+ } else if (
4378
+ tokenizer_pre == "mpt") {
4379
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
4380
+ } else if (
4381
+ tokenizer_pre == "starcoder") {
4382
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
4383
+ } else if (
4384
+ tokenizer_pre == "gpt-2") {
4385
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4386
+ } else {
4387
+ throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4388
+ }
4238
4389
  } else {
4239
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
4240
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4241
-
4242
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
4390
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4243
4391
  }
4244
4392
  }
4245
4393
 
@@ -4308,6 +4456,7 @@ static void llm_load_vocab(
4308
4456
  { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
4309
4457
  { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
4310
4458
  };
4459
+
4311
4460
  for (const auto & it : special_token_types) {
4312
4461
  const std::string & key = kv(std::get<0>(it));
4313
4462
  int32_t & id = std::get<1>(it);
@@ -4322,7 +4471,6 @@ static void llm_load_vocab(
4322
4471
  } else {
4323
4472
  id = new_id;
4324
4473
  }
4325
-
4326
4474
  }
4327
4475
 
4328
4476
  // Handle add_bos_token and add_eos_token
@@ -4336,6 +4484,28 @@ static void llm_load_vocab(
4336
4484
  vocab.special_add_eos = int(temp);
4337
4485
  }
4338
4486
  }
4487
+
4488
+ // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
4489
+ //
4490
+ // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
4491
+ // for now, we apply this workaround to find the EOT token based on its text
4492
+ if (vocab.special_eot_id == -1) {
4493
+ for (const auto & t : vocab.token_to_id) {
4494
+ if (
4495
+ // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
4496
+ // need to fix convert script
4497
+ //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
4498
+ (t.first == "<|eot_id|>" ||
4499
+ t.first == "<|im_end|>" ||
4500
+ t.first == "<|end|>" ||
4501
+ t.first == "<end_of_turn>"
4502
+ )
4503
+ ) {
4504
+ vocab.special_eot_id = t.second;
4505
+ break;
4506
+ }
4507
+ }
4508
+ }
4339
4509
  }
4340
4510
 
4341
4511
  // build special tokens cache
@@ -4498,14 +4668,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
4498
4668
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
4499
4669
 
4500
4670
  // special tokens
4501
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
4502
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
4503
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
4504
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
4505
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
4506
- if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
4507
- if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
4508
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
4671
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
4672
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
4673
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
4674
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
4675
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
4676
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
4677
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
4678
+
4679
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
4680
+ if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
4681
+ if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
4682
+ if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
4683
+ if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
4509
4684
  }
4510
4685
 
4511
4686
  // Returns false if cancelled by progress_callback
@@ -5346,6 +5521,33 @@ static bool llm_load_tensors(
5346
5521
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
5347
5522
  }
5348
5523
  } break;
5524
+ case LLM_ARCH_PHI3:
5525
+ {
5526
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
5527
+
5528
+ // output
5529
+ {
5530
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
5531
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
5532
+ }
5533
+
5534
+ for (int i = 0; i < n_layer; ++i) {
5535
+ ggml_context* ctx_layer = ctx_for_layer(i);
5536
+ ggml_context* ctx_split = ctx_for_layer_split(i);
5537
+
5538
+ auto& layer = model.layers[i];
5539
+
5540
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
5541
+
5542
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
5543
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
5544
+
5545
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
5546
+
5547
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
5548
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
5549
+ }
5550
+ } break;
5349
5551
  case LLM_ARCH_PLAMO:
5350
5552
  {
5351
5553
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5880,7 +6082,7 @@ static bool llm_load_tensors(
5880
6082
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
5881
6083
  static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
5882
6084
  try {
5883
- llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
6085
+ llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
5884
6086
 
5885
6087
  model.hparams.vocab_only = params.vocab_only;
5886
6088
 
@@ -6009,37 +6211,47 @@ static struct ggml_tensor * llm_build_inp_embd(
6009
6211
  static void llm_build_kv_store(
6010
6212
  struct ggml_context * ctx,
6011
6213
  const llama_hparams & hparams,
6214
+ const llama_cparams & cparams,
6012
6215
  const llama_kv_cache & kv,
6013
6216
  struct ggml_cgraph * graph,
6014
6217
  struct ggml_tensor * k_cur,
6015
6218
  struct ggml_tensor * v_cur,
6016
- int64_t n_ctx,
6017
6219
  int32_t n_tokens,
6018
6220
  int32_t kv_head,
6019
6221
  const llm_build_cb & cb,
6020
6222
  int64_t il) {
6223
+ const int64_t n_ctx = cparams.n_ctx;
6224
+
6021
6225
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
6022
6226
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
6023
6227
 
6024
6228
  GGML_ASSERT(kv.size == n_ctx);
6025
6229
 
6026
- // compute the transposed [n_tokens, n_embd] V matrix
6027
- assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
6028
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
6029
- cb(v_cur_t, "v_cur_t", il);
6030
-
6031
6230
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
6032
6231
  (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
6033
6232
  cb(k_cache_view, "k_cache_view", il);
6034
6233
 
6035
- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
6036
- ( n_ctx)*ggml_element_size(kv.v_l[il]),
6037
- (kv_head)*ggml_element_size(kv.v_l[il]));
6234
+ // note: storing RoPE-ed version of K in the KV cache
6235
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
6236
+
6237
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
6238
+
6239
+ struct ggml_tensor * v_cache_view = nullptr;
6240
+
6241
+ if (cparams.flash_attn) {
6242
+ v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
6243
+ (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
6244
+ } else {
6245
+ // note: the V cache is transposed when not using flash attention
6246
+ v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
6247
+ ( n_ctx)*ggml_element_size(kv.v_l[il]),
6248
+ (kv_head)*ggml_element_size(kv.v_l[il]));
6249
+
6250
+ v_cur = ggml_transpose(ctx, v_cur);
6251
+ }
6038
6252
  cb(v_cache_view, "v_cache_view", il);
6039
6253
 
6040
- // important: storing RoPE-ed version of K in the KV cache!
6041
- ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
6042
- ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
6254
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
6043
6255
  }
6044
6256
 
6045
6257
  static struct ggml_tensor * llm_build_norm(
@@ -6259,11 +6471,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
6259
6471
  return moe_out;
6260
6472
  }
6261
6473
 
6262
- // if max_alibi_bias > 0 then apply ALiBi
6263
6474
  static struct ggml_tensor * llm_build_kqv(
6264
6475
  struct ggml_context * ctx,
6265
6476
  const llama_model & model,
6266
6477
  const llama_hparams & hparams,
6478
+ const llama_cparams & cparams,
6267
6479
  const llama_kv_cache & kv,
6268
6480
  struct ggml_cgraph * graph,
6269
6481
  struct ggml_tensor * wo,
@@ -6271,12 +6483,12 @@ static struct ggml_tensor * llm_build_kqv(
6271
6483
  struct ggml_tensor * q_cur,
6272
6484
  struct ggml_tensor * kq_mask,
6273
6485
  struct ggml_tensor * kq_pos,
6274
- int64_t n_ctx,
6275
6486
  int32_t n_tokens,
6276
6487
  int32_t n_kv,
6277
6488
  float kq_scale,
6278
6489
  const llm_build_cb & cb,
6279
6490
  int il) {
6491
+ const int64_t n_ctx = cparams.n_ctx;
6280
6492
  const int64_t n_head = hparams.n_head;
6281
6493
  const int64_t n_head_kv = hparams.n_head_kv;
6282
6494
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -6294,71 +6506,99 @@ static struct ggml_tensor * llm_build_kqv(
6294
6506
  0);
6295
6507
  cb(k, "k", il);
6296
6508
 
6297
- struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6298
- cb(kq, "kq", il);
6509
+ struct ggml_tensor * cur;
6299
6510
 
6300
- if (model.arch == LLM_ARCH_PHI2) {
6301
- // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6302
- // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6303
- ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6304
- }
6511
+ if (cparams.flash_attn) {
6512
+ GGML_UNUSED(model);
6513
+ GGML_UNUSED(n_ctx);
6305
6514
 
6306
- if (model.arch == LLM_ARCH_GROK) {
6307
- // need to do the following:
6308
- // multiply by attn_output_multiplyer of 0.08838834764831845
6309
- // and then :
6310
- // kq = 30 * tanh(kq / 30)
6311
- // before the softmax below
6515
+ // note: if this assert triggers, then some check has failed earlier
6516
+ // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
6517
+ GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
6312
6518
 
6313
- //try from phi2
6314
- //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6519
+ // split cached v into n_head heads (not transposed)
6520
+ struct ggml_tensor * v =
6521
+ ggml_view_3d(ctx, kv.v_l[il],
6522
+ n_embd_head_v, n_kv, n_head_kv,
6523
+ ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
6524
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
6525
+ 0);
6526
+ cb(v, "v", il);
6315
6527
 
6316
- kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
6317
- kq = ggml_scale(ctx, kq, 30);
6318
- }
6528
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
6529
+
6530
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6531
+ ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6532
+ }
6533
+
6534
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
6535
+ } else {
6536
+ struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6537
+ cb(kq, "kq", il);
6538
+
6539
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6540
+ // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
6541
+ // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
6542
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6543
+ }
6544
+
6545
+ if (model.arch == LLM_ARCH_GROK) {
6546
+ // need to do the following:
6547
+ // multiply by attn_output_multiplyer of 0.08838834764831845
6548
+ // and then :
6549
+ // kq = 30 * tanh(kq / 30)
6550
+ // before the softmax below
6551
+
6552
+ //try from phi2
6553
+ //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
6554
+
6555
+ kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
6556
+ kq = ggml_scale(ctx, kq, 30);
6557
+ }
6319
6558
 
6320
6559
  #if defined(GGML_USE_KOMPUTE)
6321
6560
  #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
6322
6561
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
6323
6562
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
6324
- if (hparams.f_max_alibi_bias > 0.0f) {
6325
- kq = ggml_scale(ctx, kq, kq_scale);
6326
- cb(kq, "kq_scaled", il);
6563
+ if (hparams.use_alibi) {
6564
+ kq = ggml_scale(ctx, kq, kq_scale);
6565
+ cb(kq, "kq_scaled", il);
6327
6566
 
6328
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6329
- cb(kq, "kq_scaled_alibi", il);
6567
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6568
+ cb(kq, "kq_scaled_alibi", il);
6330
6569
 
6331
- kq = ggml_add(ctx, kq, kq_mask);
6332
- cb(kq, "kq_masked", il);
6570
+ kq = ggml_add(ctx, kq, kq_mask);
6571
+ cb(kq, "kq_masked", il);
6333
6572
 
6334
- kq = ggml_soft_max(ctx, kq);
6335
- cb(kq, "kq_soft_max", il);
6336
- } else
6573
+ kq = ggml_soft_max(ctx, kq);
6574
+ cb(kq, "kq_soft_max", il);
6575
+ } else
6337
6576
  #endif
6338
- {
6339
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6340
- cb(kq, "kq_soft_max_ext", il);
6341
- }
6577
+ {
6578
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6579
+ cb(kq, "kq_soft_max_ext", il);
6580
+ }
6342
6581
 
6343
- GGML_ASSERT(kv.size == n_ctx);
6582
+ GGML_ASSERT(kv.size == n_ctx);
6344
6583
 
6345
- // split cached v into n_head heads
6346
- struct ggml_tensor * v =
6347
- ggml_view_3d(ctx, kv.v_l[il],
6348
- n_kv, n_embd_head_v, n_head_kv,
6349
- ggml_element_size(kv.v_l[il])*n_ctx,
6350
- ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
6351
- 0);
6352
- cb(v, "v", il);
6584
+ // split cached v into n_head heads
6585
+ struct ggml_tensor * v =
6586
+ ggml_view_3d(ctx, kv.v_l[il],
6587
+ n_kv, n_embd_head_v, n_head_kv,
6588
+ ggml_element_size(kv.v_l[il])*n_ctx,
6589
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
6590
+ 0);
6591
+ cb(v, "v", il);
6353
6592
 
6354
- struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
6355
- cb(kqv, "kqv", il);
6593
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
6594
+ cb(kqv, "kqv", il);
6356
6595
 
6357
- struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6358
- cb(kqv_merged, "kqv_merged", il);
6596
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6597
+ cb(kqv_merged, "kqv_merged", il);
6359
6598
 
6360
- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6361
- cb(cur, "kqv_merged_cont", il);
6599
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6600
+ cb(cur, "kqv_merged_cont", il);
6601
+ }
6362
6602
 
6363
6603
  ggml_build_forward_expand(graph, cur);
6364
6604
 
@@ -6378,6 +6618,7 @@ static struct ggml_tensor * llm_build_kv(
6378
6618
  struct ggml_context * ctx,
6379
6619
  const llama_model & model,
6380
6620
  const llama_hparams & hparams,
6621
+ const llama_cparams & cparams,
6381
6622
  const llama_kv_cache & kv,
6382
6623
  struct ggml_cgraph * graph,
6383
6624
  struct ggml_tensor * wo,
@@ -6387,7 +6628,6 @@ static struct ggml_tensor * llm_build_kv(
6387
6628
  struct ggml_tensor * q_cur,
6388
6629
  struct ggml_tensor * kq_mask,
6389
6630
  struct ggml_tensor * kq_pos,
6390
- int64_t n_ctx,
6391
6631
  int32_t n_tokens,
6392
6632
  int32_t kv_head,
6393
6633
  int32_t n_kv,
@@ -6401,12 +6641,12 @@ static struct ggml_tensor * llm_build_kv(
6401
6641
  ggml_build_forward_expand(graph, k_cur);
6402
6642
  ggml_build_forward_expand(graph, v_cur);
6403
6643
 
6404
- llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
6644
+ llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
6405
6645
 
6406
6646
  struct ggml_tensor * cur;
6407
6647
 
6408
- cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
6409
- q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
6648
+ cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
6649
+ q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
6410
6650
  cb(cur, "kqv_out", il);
6411
6651
 
6412
6652
  return cur;
@@ -6448,6 +6688,8 @@ struct llm_build_context {
6448
6688
  const int32_t kv_head; // index of where we store new KV data in the cache
6449
6689
  const int32_t n_orig_ctx;
6450
6690
 
6691
+ const bool flash_attn;
6692
+
6451
6693
  const enum llama_pooling_type pooling_type;
6452
6694
  const enum llama_rope_type rope_type;
6453
6695
 
@@ -6494,6 +6736,7 @@ struct llm_build_context {
6494
6736
  n_outputs (worst_case ? n_tokens : lctx.n_outputs),
6495
6737
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
6496
6738
  n_orig_ctx (cparams.n_yarn_orig_ctx),
6739
+ flash_attn (cparams.flash_attn),
6497
6740
  pooling_type (cparams.pooling_type),
6498
6741
  rope_type (hparams.rope_type),
6499
6742
  cb (cb),
@@ -6608,15 +6851,31 @@ struct llm_build_context {
6608
6851
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
6609
6852
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
6610
6853
 
6611
- ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6612
- nm, n_embd_v_gqa,
6613
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6614
- ggml_row_size(kv_self.v_l[il]->type, i));
6854
+ ggml_tensor * view_v_src;
6855
+ ggml_tensor * view_v_dst;
6615
6856
 
6616
- ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6617
- nm, n_embd_v_gqa,
6618
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6619
- ggml_row_size(kv_self.v_l[il]->type, id));
6857
+ if (flash_attn) {
6858
+ // NOTE: the V cache is not transposed when using flash attention
6859
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6860
+ n_embd_v_gqa, nm,
6861
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
6862
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
6863
+
6864
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6865
+ n_embd_v_gqa, nm,
6866
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
6867
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
6868
+ } else {
6869
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
6870
+ nm, n_embd_v_gqa,
6871
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6872
+ ggml_row_size(kv_self.v_l[il]->type, i));
6873
+
6874
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
6875
+ nm, n_embd_v_gqa,
6876
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
6877
+ ggml_row_size(kv_self.v_l[il]->type, id));
6878
+ }
6620
6879
 
6621
6880
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
6622
6881
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
@@ -6646,20 +6905,26 @@ struct llm_build_context {
6646
6905
 
6647
6906
  struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
6648
6907
  if (causal) {
6649
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
6908
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
6650
6909
  } else {
6651
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
6910
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
6652
6911
  }
6653
6912
  cb(lctx.inp_KQ_mask, "KQ_mask", -1);
6654
6913
  ggml_set_input(lctx.inp_KQ_mask);
6655
- return lctx.inp_KQ_mask;
6914
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
6656
6915
  }
6657
6916
 
6658
- struct ggml_tensor * build_inp_KQ_pos() {
6659
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6917
+ struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
6918
+ if (causal) {
6919
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6920
+ } else {
6921
+ // TODO: this will be needed for ALiBi-based BERT models
6922
+ // https://github.com/ggerganov/llama.cpp/pull/6826
6923
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
6924
+ }
6660
6925
  cb(lctx.inp_KQ_pos, "KQ_pos", -1);
6661
6926
  ggml_set_input(lctx.inp_KQ_pos);
6662
- return lctx.inp_KQ_pos;
6927
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
6663
6928
  }
6664
6929
 
6665
6930
  struct ggml_tensor * build_inp_mean() {
@@ -6765,9 +7030,9 @@ struct llm_build_context {
6765
7030
  );
6766
7031
  cb(Kcur, "Kcur", il);
6767
7032
 
6768
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7033
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
6769
7034
  model.layers[il].wo, model.layers[il].bo,
6770
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7035
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6771
7036
  }
6772
7037
 
6773
7038
  if (il == n_layer - 1) {
@@ -6905,9 +7170,9 @@ struct llm_build_context {
6905
7170
  cb(Qcur, "Qcur", il);
6906
7171
  cb(Kcur, "Kcur", il);
6907
7172
 
6908
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7173
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
6909
7174
  model.layers[il].wo, NULL,
6910
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7175
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6911
7176
  }
6912
7177
 
6913
7178
  if (il == n_layer - 1) {
@@ -7012,9 +7277,9 @@ struct llm_build_context {
7012
7277
  ext_factor, attn_factor, beta_fast, beta_slow
7013
7278
  );
7014
7279
  cb(Kcur, "Kcur", il);
7015
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7280
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7016
7281
  model.layers[il].wo, NULL,
7017
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7282
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7018
7283
  }
7019
7284
 
7020
7285
  if (il == n_layer - 1) {
@@ -7132,9 +7397,9 @@ struct llm_build_context {
7132
7397
  );
7133
7398
  cb(Kcur, "Kcur", il);
7134
7399
 
7135
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7400
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7136
7401
  model.layers[il].wo, NULL,
7137
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7402
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7138
7403
  }
7139
7404
 
7140
7405
  if (il == n_layer - 1) {
@@ -7257,9 +7522,9 @@ struct llm_build_context {
7257
7522
  );
7258
7523
  cb(Kcur, "Kcur", il);
7259
7524
 
7260
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7525
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7261
7526
  model.layers[il].wo, model.layers[il].bo,
7262
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7527
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7263
7528
  }
7264
7529
 
7265
7530
  if (il == n_layer - 1) {
@@ -7409,9 +7674,9 @@ struct llm_build_context {
7409
7674
  );
7410
7675
  cb(Kcur, "Kcur", il);
7411
7676
 
7412
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7413
- model.layers[il].wo, NULL,
7414
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7677
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7678
+ model.layers[il].wo, NULL,
7679
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7415
7680
  }
7416
7681
 
7417
7682
  if (il == n_layer - 1) {
@@ -7521,9 +7786,9 @@ struct llm_build_context {
7521
7786
 
7522
7787
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7523
7788
 
7524
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7789
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7525
7790
  model.layers[il].wo, model.layers[il].bo,
7526
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7791
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7527
7792
  }
7528
7793
 
7529
7794
  if (il == n_layer - 1) {
@@ -7725,9 +7990,9 @@ struct llm_build_context {
7725
7990
  );
7726
7991
  cb(Vcur, "Vcur", il);
7727
7992
 
7728
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7993
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7729
7994
  model.layers[il].wo, model.layers[il].bo,
7730
- Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7995
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7731
7996
  }
7732
7997
 
7733
7998
  if (il == n_layer - 1) {
@@ -7821,9 +8086,9 @@ struct llm_build_context {
7821
8086
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7822
8087
  cb(Qcur, "Qcur", il);
7823
8088
 
7824
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8089
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7825
8090
  model.layers[il].wo, NULL,
7826
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8091
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7827
8092
  }
7828
8093
 
7829
8094
  if (il == n_layer - 1) {
@@ -8114,9 +8379,9 @@ struct llm_build_context {
8114
8379
 
8115
8380
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8116
8381
 
8117
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8382
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8118
8383
  model.layers[il].wo, model.layers[il].bo,
8119
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8384
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8120
8385
  }
8121
8386
 
8122
8387
  if (il == n_layer - 1) {
@@ -8245,14 +8510,15 @@ struct llm_build_context {
8245
8510
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8246
8511
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8247
8512
 
8248
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8249
- model.layers[il].wo, model.layers[il].bo,
8250
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8513
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8514
+ model.layers[il].wo, model.layers[il].bo,
8515
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8251
8516
  } else {
8252
8517
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8253
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8518
+
8519
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8254
8520
  model.layers[il].wo, model.layers[il].bo,
8255
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8521
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8256
8522
  }
8257
8523
  }
8258
8524
 
@@ -8394,9 +8660,9 @@ struct llm_build_context {
8394
8660
  );
8395
8661
  cb(Kcur, "Kcur", il);
8396
8662
 
8397
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8663
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8398
8664
  model.layers[il].wo, NULL,
8399
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8665
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8400
8666
  }
8401
8667
 
8402
8668
  if (il == n_layer - 1) {
@@ -8512,9 +8778,9 @@ struct llm_build_context {
8512
8778
  );
8513
8779
  cb(Kcur, "Kcur", il);
8514
8780
 
8515
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8781
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8516
8782
  model.layers[il].wo, NULL,
8517
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8783
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8518
8784
  }
8519
8785
 
8520
8786
  if (il == n_layer - 1) {
@@ -8625,9 +8891,9 @@ struct llm_build_context {
8625
8891
  );
8626
8892
  cb(Kcur, "Kcur", il);
8627
8893
 
8628
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8894
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8629
8895
  model.layers[il].wo, model.layers[il].bo,
8630
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8896
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8631
8897
  }
8632
8898
 
8633
8899
  if (il == n_layer - 1) {
@@ -8739,9 +9005,9 @@ struct llm_build_context {
8739
9005
  );
8740
9006
  cb(Kcur, "Kcur", il);
8741
9007
 
8742
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9008
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8743
9009
  model.layers[il].wo, model.layers[il].bo,
8744
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9010
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8745
9011
  }
8746
9012
 
8747
9013
  if (il == n_layer - 1) {
@@ -8894,9 +9160,9 @@ struct llm_build_context {
8894
9160
  );
8895
9161
  cb(Kcur, "Kcur", il);
8896
9162
 
8897
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9163
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8898
9164
  model.layers[il].wo, model.layers[il].bo,
8899
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9165
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8900
9166
  }
8901
9167
 
8902
9168
  if (il == n_layer - 1) {
@@ -8938,12 +9204,140 @@ struct llm_build_context {
8938
9204
 
8939
9205
  cur = ggml_add(ctx0, cur, model.output_b);
8940
9206
  cb(cur, "result_output", -1);
9207
+ ggml_build_forward_expand(gf, cur);
9208
+ return gf;
9209
+ }
9210
+
9211
+ struct ggml_cgraph * build_phi3() {
9212
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
9213
+
9214
+ const int64_t n_embd_head = hparams.n_embd_head_v;
9215
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
9216
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
9217
+
9218
+ struct ggml_tensor * cur;
9219
+ struct ggml_tensor * inpL;
9220
+
9221
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
9222
+
9223
+ // inp_pos - contains the positions
9224
+ struct ggml_tensor * inp_pos = build_inp_pos();
9225
+
9226
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
9227
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
9228
+
9229
+ for (int il = 0; il < n_layer; ++il) {
9230
+ auto residual = inpL;
9231
+
9232
+ // self-attention
9233
+ {
9234
+ struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
9235
+ model.layers[il].attn_norm,
9236
+ NULL,
9237
+ LLM_NORM_RMS, cb, il);
9238
+ cb(attn_norm_output, "attn_norm", il);
9239
+
9240
+ struct ggml_tensor * Qcur = nullptr;
9241
+ struct ggml_tensor * Kcur = nullptr;
9242
+ struct ggml_tensor * Vcur = nullptr;
9243
+
9244
+ if (model.layers[il].wqkv) {
9245
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
9246
+ cb(cur, "wqkv", il);
9247
+
9248
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
9249
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
9250
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
9251
+ }
9252
+ else {
9253
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
9254
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
9255
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
9256
+ }
9257
+
9258
+ cb(Qcur, "Qcur", il);
9259
+ cb(Kcur, "Kcur", il);
9260
+ cb(Vcur, "Vcur", il);
9261
+
9262
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9263
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9264
+
9265
+ Qcur = ggml_rope_custom(
9266
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9267
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9268
+ );
9269
+ cb(Qcur, "Qcur", il);
9270
+
9271
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
9272
+ cb(Qcur, "Qcur", il);
9273
+
9274
+ Kcur = ggml_rope_custom(
9275
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9276
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9277
+ );
9278
+ cb(Kcur, "Kcur", il);
9279
+
9280
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9281
+ model.layers[il].wo, model.layers[il].bo,
9282
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9283
+ }
9284
+
9285
+ if (il == n_layer - 1) {
9286
+ // skip computing output for unused tokens
9287
+ struct ggml_tensor* inp_out_ids = build_inp_out_ids();
9288
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9289
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
9290
+ }
9291
+
9292
+ cur = ggml_add(ctx0, cur, residual);
9293
+ residual = cur;
9294
+
9295
+ cur = llm_build_norm(ctx0, cur, hparams,
9296
+ model.layers[il].ffn_norm, NULL,
9297
+ LLM_NORM_RMS, cb, il);
9298
+ cb(cur, "ffn_norm", il);
9299
+
9300
+ // FF
9301
+ // special-case: the up and gate tensors are merged into a single tensor
9302
+ // TODO: support this in llm_build_ffn
9303
+ {
9304
+ struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
9305
+ cb(up, "ffn_up", il);
9306
+
9307
+ auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
9308
+ auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
9309
+
9310
+ y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
9311
+ cb(y, "ffn_gate", il);
9312
+
9313
+ auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
9314
+ cb(down, "ffn_down", il);
9315
+
9316
+ cur = down;
9317
+ cb(cur, "ffn_out", il);
9318
+ }
9319
+
9320
+ cur = ggml_add(ctx0, residual, cur);
9321
+ cb(cur, "l_out", il);
9322
+
9323
+ inpL = cur;
9324
+ }
9325
+
9326
+ cur = llm_build_norm(ctx0, inpL, hparams,
9327
+ model.output_norm,
9328
+ NULL,
9329
+ LLM_NORM_RMS, cb, -1);
9330
+ cb(cur, "result_norm", -1);
9331
+
9332
+ cur = ggml_mul_mat(ctx0, model.output, cur);
9333
+ cb(cur, "result_output", -1);
8941
9334
 
8942
9335
  ggml_build_forward_expand(gf, cur);
8943
9336
 
8944
9337
  return gf;
8945
9338
  }
8946
9339
 
9340
+
8947
9341
  struct ggml_cgraph * build_plamo() {
8948
9342
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);
8949
9343
 
@@ -8996,9 +9390,9 @@ struct llm_build_context {
8996
9390
  ext_factor, attn_factor, beta_fast, beta_slow);
8997
9391
  cb(Kcur, "Kcur", il);
8998
9392
 
8999
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9393
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9000
9394
  model.layers[il].wo, NULL,
9001
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9395
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9002
9396
  }
9003
9397
  struct ggml_tensor * sa_out = cur;
9004
9398
 
@@ -9099,9 +9493,9 @@ struct llm_build_context {
9099
9493
 
9100
9494
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9101
9495
 
9102
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9496
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9103
9497
  model.layers[il].wo, model.layers[il].bo,
9104
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9498
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9105
9499
  }
9106
9500
 
9107
9501
  if (il == n_layer - 1) {
@@ -9206,9 +9600,9 @@ struct llm_build_context {
9206
9600
  );
9207
9601
  cb(Kcur, "Kcur", il);
9208
9602
 
9209
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9603
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9210
9604
  model.layers[il].wo, model.layers[il].bo,
9211
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9605
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9212
9606
  }
9213
9607
 
9214
9608
  if (il == n_layer - 1) {
@@ -9322,9 +9716,9 @@ struct llm_build_context {
9322
9716
  );
9323
9717
  cb(Kcur, "Kcur", il);
9324
9718
 
9325
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9719
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9326
9720
  model.layers[il].wo, NULL,
9327
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9721
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9328
9722
  }
9329
9723
 
9330
9724
  if (il == n_layer - 1) {
@@ -9439,9 +9833,9 @@ struct llm_build_context {
9439
9833
  );
9440
9834
  cb(Kcur, "Kcur", il);
9441
9835
 
9442
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9836
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9443
9837
  model.layers[il].wo, model.layers[il].bo,
9444
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9838
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9445
9839
  }
9446
9840
 
9447
9841
  if (il == n_layer - 1) {
@@ -9569,9 +9963,9 @@ struct llm_build_context {
9569
9963
  );
9570
9964
  cb(Kcur, "Kcur", il);
9571
9965
 
9572
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9966
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9573
9967
  model.layers[il].wo, model.layers[il].bo,
9574
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9968
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9575
9969
  }
9576
9970
 
9577
9971
  if (il == n_layer - 1) {
@@ -9690,9 +10084,9 @@ struct llm_build_context {
9690
10084
  ext_factor, attn_factor, beta_fast, beta_slow);
9691
10085
  cb(Kcur, "Kcur", il);
9692
10086
 
9693
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10087
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9694
10088
  model.layers[il].wo, NULL,
9695
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10089
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9696
10090
  }
9697
10091
 
9698
10092
  if (il == n_layer - 1) {
@@ -9809,9 +10203,9 @@ struct llm_build_context {
9809
10203
  );
9810
10204
  cb(Kcur, "Kcur", il);
9811
10205
 
9812
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10206
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9813
10207
  model.layers[il].wo, model.layers[il].bo,
9814
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10208
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9815
10209
  }
9816
10210
 
9817
10211
  if (il == n_layer - 1) {
@@ -10099,9 +10493,9 @@ struct llm_build_context {
10099
10493
  );
10100
10494
  cb(Kcur, "Kcur", il);
10101
10495
 
10102
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10496
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10103
10497
  model.layers[il].wo, model.layers[il].bo,
10104
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10498
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10105
10499
  }
10106
10500
 
10107
10501
  if (il == n_layer - 1) {
@@ -10230,9 +10624,9 @@ struct llm_build_context {
10230
10624
  );
10231
10625
  cb(Kcur, "Kcur", il);
10232
10626
 
10233
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10627
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10234
10628
  model.layers[il].wo, nullptr,
10235
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10629
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10236
10630
  }
10237
10631
 
10238
10632
  if (il == n_layer - 1) {
@@ -10445,6 +10839,10 @@ static struct ggml_cgraph * llama_build_graph(
10445
10839
  {
10446
10840
  result = llm.build_phi2();
10447
10841
  } break;
10842
+ case LLM_ARCH_PHI3:
10843
+ {
10844
+ result = llm.build_phi3();
10845
+ } break;
10448
10846
  case LLM_ARCH_PLAMO:
10449
10847
  {
10450
10848
  result = llm.build_plamo();
@@ -10655,7 +11053,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
10655
11053
  }
10656
11054
  }
10657
11055
 
10658
- if (hparams.need_kq_pos) {
11056
+ // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
11057
+ // this allows to process multiple sequences in parallel with ALiBi-based models
11058
+ if (hparams.use_alibi) {
10659
11059
  const int64_t n_kv = kv_self.n;
10660
11060
 
10661
11061
  GGML_ASSERT(lctx.inp_KQ_pos);
@@ -11037,7 +11437,7 @@ static int llama_decode_internal(
11037
11437
  // a heuristic, to avoid attending the full cache if it is not yet utilized
11038
11438
  // after enough generations, the benefit from this heuristic disappears
11039
11439
  // if we start defragmenting the cache, the benefit from this will be more important
11040
- kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
11440
+ kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
11041
11441
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
11042
11442
  }
11043
11443
  }
@@ -11205,6 +11605,10 @@ static int llama_decode_internal(
11205
11605
  }
11206
11606
  }
11207
11607
 
11608
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
11609
+ // overlap with device computation.
11610
+ ggml_backend_sched_reset(lctx.sched);
11611
+
11208
11612
  return 0;
11209
11613
  }
11210
11614
 
@@ -11230,7 +11634,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
11230
11634
  // each move requires 6*n_layer tensors (see build_defrag)
11231
11635
  // - source view, destination view, copy operation
11232
11636
  // - x2 for keys and values
11233
- const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
11637
+ //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
11638
+ // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
11639
+ const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
11234
11640
 
11235
11641
  // determine which KV cells to move where
11236
11642
  //
@@ -11554,7 +11960,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
11554
11960
  }
11555
11961
  case LLAMA_VOCAB_TYPE_BPE: {
11556
11962
  GGML_ASSERT(false);
11557
- return unicode_utf8_to_byte(token_data.text);
11963
+ return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
11558
11964
  }
11559
11965
  case LLAMA_VOCAB_TYPE_WPM: {
11560
11966
  GGML_ASSERT(false);
@@ -11776,7 +12182,79 @@ struct llm_tokenizer_bpe {
11776
12182
 
11777
12183
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
11778
12184
  int final_prev_index = -1;
11779
- auto word_collection = bpe_gpt2_preprocess(text);
12185
+
12186
+ std::vector<std::string> word_collection;
12187
+ switch (vocab.type) {
12188
+ case LLAMA_VOCAB_TYPE_BPE:
12189
+ switch (vocab.type_pre) {
12190
+ case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12191
+ word_collection = unicode_regex_split(text, {
12192
+ // original regex from tokenizer.json
12193
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12194
+
12195
+ // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
12196
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12197
+ });
12198
+ break;
12199
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12200
+ word_collection = unicode_regex_split(text, {
12201
+ "[\r\n]",
12202
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
12203
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
12204
+ "\\s+$",
12205
+ "[一-龥ࠀ-一가-퟿]+",
12206
+ "\\p{N}+",
12207
+ });
12208
+ break;
12209
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
12210
+ word_collection = unicode_regex_split(text, {
12211
+ "[\r\n]",
12212
+ "\\s?\\p{L}+",
12213
+ "\\s?\\p{P}+",
12214
+ "[一-龥ࠀ-一가-퟿]+",
12215
+ "\\p{N}+",
12216
+ });
12217
+ break;
12218
+ case LLAMA_VOCAB_PRE_TYPE_FALCON:
12219
+ word_collection = unicode_regex_split(text, {
12220
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12221
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12222
+ "\\p{N}+",
12223
+ "[0-9][0-9][0-9]",
12224
+ });
12225
+ break;
12226
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
12227
+ // TODO: MPT pre-tokenization regexes are unknown
12228
+ // the following are close, but not exact. run the following:
12229
+ // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
12230
+ GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
12231
+ word_collection = unicode_regex_split(text, {
12232
+ "\\s?\\p{L}+",
12233
+ "\\s?\\p{P}+",
12234
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12235
+ });
12236
+ break;
12237
+ case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12238
+ case LLAMA_VOCAB_PRE_TYPE_GPT2:
12239
+ word_collection = unicode_regex_split(text, {
12240
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12241
+ });
12242
+ break;
12243
+ default:
12244
+ // default regex for BPE tokenization pre-processing
12245
+ word_collection = unicode_regex_split(text, {
12246
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12247
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12248
+ "\\p{N}+",
12249
+ "[0-9][0-9][0-9]",
12250
+ });
12251
+ break;
12252
+ }
12253
+ break;
12254
+ default:
12255
+ GGML_ASSERT(false);
12256
+ break;
12257
+ }
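// [editor's sketch, not part of the diff] The pre-tokenizer regex above is selected
// internally from vocab.type_pre (filled in by the model loader), so callers keep using
// llama_tokenize unchanged. Minimal sketch, assuming this release's parameter names
// (add_special/parse_special) and an already loaded `model`:
std::string prompt = "Hello world";
std::vector<llama_token> tokens(64);
int32_t n = llama_tokenize(model, prompt.c_str(), (int32_t) prompt.size(),
                           tokens.data(), (int32_t) tokens.size(),
                           /*add_special*/ true, /*parse_special*/ false);
if (n < 0) {
    tokens.resize(-n);   // buffer was too small, -n is the required token count
    n = llama_tokenize(model, prompt.c_str(), (int32_t) prompt.size(),
                       tokens.data(), (int32_t) tokens.size(),
                       /*add_special*/ true, /*parse_special*/ false);
}
tokens.resize(n);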
11780
12258
 
11781
12259
  symbols_final.clear();
11782
12260
 
@@ -11903,145 +12381,6 @@ private:
11903
12381
  work_queue.push(bigram);
11904
12382
  }
11905
12383
 
11906
- std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
11907
- std::vector<std::string> bpe_words;
11908
- std::vector<std::string> bpe_encoded_words;
11909
-
11910
- std::string token = "";
11911
- // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
11912
- bool collecting_numeric = false;
11913
- bool collecting_letter = false;
11914
- bool collecting_special = false;
11915
- bool collecting_whitespace_lookahead = false;
11916
- bool collecting = false;
11917
-
11918
- std::vector<std::string> text_utf;
11919
- text_utf.reserve(text.size());
11920
- bpe_words.reserve(text.size());
11921
- bpe_encoded_words.reserve(text.size());
11922
-
11923
- const auto cpts = unicode_cpts_from_utf8(text);
11924
- for (size_t i = 0; i < cpts.size(); ++i)
11925
- text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
11926
-
11927
- for (int i = 0; i < (int)text_utf.size(); i++) {
11928
- const std::string & utf_char = text_utf[i];
11929
- bool split_condition = false;
11930
- int bytes_remain = text_utf.size() - i;
11931
- // forward backward lookups
11932
- const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
11933
- const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
11934
-
11935
- // handling contractions
11936
- if (!split_condition && bytes_remain >= 2) {
11937
- // 's|'t|'m|'d
11938
- if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
11939
- split_condition = true;
11940
- }
11941
- if (split_condition) {
11942
- if (token.size()) {
11943
- bpe_words.emplace_back(token); // push previous content as token
11944
- }
11945
- token = utf_char + utf_char_next;
11946
- bpe_words.emplace_back(token);
11947
- token = "";
11948
- i++;
11949
- continue;
11950
- }
11951
- }
11952
- if (!split_condition && bytes_remain >= 3) {
11953
- // 're|'ve|'ll
11954
- if (utf_char == "\'" && (
11955
- (utf_char_next == "r" && utf_char_next_next == "e") ||
11956
- (utf_char_next == "v" && utf_char_next_next == "e") ||
11957
- (utf_char_next == "l" && utf_char_next_next == "l"))
11958
- ) {
11959
- split_condition = true;
11960
- }
11961
- if (split_condition) {
11962
- // current token + next token can be defined
11963
- if (token.size()) {
11964
- bpe_words.emplace_back(token); // push previous content as token
11965
- }
11966
- token = utf_char + utf_char_next + utf_char_next_next;
11967
- bpe_words.emplace_back(token); // the contraction
11968
- token = "";
11969
- i += 2;
11970
- continue;
11971
- }
11972
- }
11973
-
11974
- if (!split_condition && !collecting) {
11975
- if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
11976
- collecting_letter = true;
11977
- collecting = true;
11978
- }
11979
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
11980
- collecting_numeric = true;
11981
- collecting = true;
11982
- }
11983
- else if (
11984
- ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
11985
- (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
11986
- ) {
11987
- collecting_special = true;
11988
- collecting = true;
11989
- }
11990
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
11991
- collecting_whitespace_lookahead = true;
11992
- collecting = true;
11993
- }
11994
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
11995
- split_condition = true;
11996
- }
11997
- }
11998
- else if (!split_condition && collecting) {
11999
- if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
12000
- split_condition = true;
12001
- }
12002
- else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
12003
- split_condition = true;
12004
- }
12005
- else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
12006
- split_condition = true;
12007
- }
12008
- else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
12009
- split_condition = true;
12010
- }
12011
- }
12012
-
12013
- if (utf_char_next == "") {
12014
- split_condition = true; // final
12015
- token += utf_char;
12016
- }
12017
-
12018
- if (split_condition) {
12019
- if (token.size()) {
12020
- bpe_words.emplace_back(token);
12021
- }
12022
- token = utf_char;
12023
- collecting = false;
12024
- collecting_letter = false;
12025
- collecting_numeric = false;
12026
- collecting_special = false;
12027
- collecting_whitespace_lookahead = false;
12028
- }
12029
- else {
12030
- token += utf_char;
12031
- }
12032
- }
12033
-
12034
- for (std::string & word : bpe_words) {
12035
- std::string encoded_token = "";
12036
- for (char & c : word) {
12037
- encoded_token += unicode_byte_to_utf8(c);
12038
- }
12039
- bpe_encoded_words.emplace_back(encoded_token);
12040
- }
12041
-
12042
- return bpe_encoded_words;
12043
- }
12044
-
12045
12384
  const llama_vocab & vocab;
12046
12385
 
12047
12386
  std::vector<llm_symbol> symbols;
@@ -12361,7 +12700,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12361
12700
  } break;
12362
12701
  case LLAMA_VOCAB_TYPE_BPE:
12363
12702
  {
12364
- if (add_special && vocab.special_add_bos == 1) {
12703
+ if (add_special && vocab.special_add_bos != 0) {
12365
12704
  GGML_ASSERT(vocab.special_bos_id != -1);
12366
12705
  output.push_back(vocab.special_bos_id);
12367
12706
  }
@@ -13268,16 +13607,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
13268
13607
  GGML_ASSERT(ctx);
13269
13608
  const int64_t t_start_sample_us = ggml_time_us();
13270
13609
 
13271
- bool allow_eos = false;
13610
+ bool allow_eog = false;
13272
13611
  for (const auto & stack : grammar->stacks) {
13273
13612
  if (stack.empty()) {
13274
- allow_eos = true;
13613
+ allow_eog = true;
13275
13614
  break;
13276
13615
  }
13277
13616
  }
13278
13617
 
13279
- const llama_token eos = llama_token_eos(&ctx->model);
13280
-
13281
13618
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
13282
13619
  candidates_decoded.reserve(candidates->size);
13283
13620
  std::vector<llama_grammar_candidate> candidates_grammar;
@@ -13285,9 +13622,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
13285
13622
 
13286
13623
  for (size_t i = 0; i < candidates->size; ++i) {
13287
13624
  const llama_token id = candidates->data[i].id;
13288
- const std::string piece = llama_token_to_piece(ctx, id);
13289
- if (id == eos) {
13290
- if (!allow_eos) {
13625
+ const std::string piece = llama_token_to_piece(ctx, id, false);
13626
+
13627
+ if (llama_token_is_eog(&ctx->model, id)) {
13628
+ if (!allow_eog) {
13291
13629
  candidates->data[i].logit = -INFINITY;
13292
13630
  }
13293
13631
  } else if (piece.empty() || piece[0] == 0) {
@@ -13450,7 +13788,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
13450
13788
  return result;
13451
13789
  }
13452
13790
 
13453
- llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
13791
+ llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
13454
13792
  GGML_ASSERT(ctx);
13455
13793
 
13456
13794
  const int64_t t_start_sample_us = ggml_time_us();
@@ -13463,7 +13801,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
13463
13801
  }
13464
13802
 
13465
13803
  std::discrete_distribution<> dist(probs.begin(), probs.end());
13466
- auto & rng = ctx->rng;
13467
13804
  int idx = dist(rng);
13468
13805
 
13469
13806
  llama_token result = candidates->data[idx].id;
@@ -13473,10 +13810,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
13473
13810
  return result;
13474
13811
  }
13475
13812
 
13813
+ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
13814
+ return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
13815
+ }
13816
+
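// [editor's sketch, not part of the diff] llama_sample_token_with_rng lets the caller
// own the RNG, making sampling reproducible across runs; llama_sample_token keeps its
// old behaviour by forwarding ctx->rng. In this release the new overload may only be
// declared in the internal-API section of llama.h. `ctx` and `model` are assumed to be
// a valid context (with logits requested for the final token of the batch) and its model.
const float * logits  = llama_get_logits(ctx);     // logits of the last decoded token
const int     n_vocab = llama_n_vocab(model);

std::vector<llama_token_data> cur(n_vocab);
for (llama_token i = 0; i < n_vocab; ++i) {
    cur[i] = llama_token_data{ i, logits[i], 0.0f };
}
llama_token_data_array candidates = { cur.data(), cur.size(), false };

std::mt19937 rng(42);   // fixed seed: identical logits give an identical choice
const llama_token id = llama_sample_token_with_rng(ctx, &candidates, rng);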
13476
13817
  void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
13477
13818
  const int64_t t_start_sample_us = ggml_time_us();
13478
13819
 
13479
- if (token == llama_token_eos(&ctx->model)) {
13820
+ if (llama_token_is_eog(&ctx->model, token)) {
13480
13821
  for (const auto & stack : grammar->stacks) {
13481
13822
  if (stack.empty()) {
13482
13823
  return;
@@ -13485,7 +13826,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
13485
13826
  GGML_ASSERT(false);
13486
13827
  }
13487
13828
 
13488
- const std::string piece = llama_token_to_piece(ctx, token);
13829
+ const std::string piece = llama_token_to_piece(ctx, token, false);
13489
13830
 
13490
13831
  // Note terminating 0 in decoded string
13491
13832
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -14131,14 +14472,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
14131
14472
  }
14132
14473
 
14133
14474
  static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
14134
- std::mutex mutex;
14135
- int64_t counter = 0;
14136
- size_t new_size = 0;
14137
14475
  if (nthread < 2) {
14138
14476
  // single-thread
14139
- return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14477
+ size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14478
+ if (!ggml_validate_row_data(new_type, new_data, new_size)) {
14479
+ throw std::runtime_error("quantized data validation failed");
14480
+ }
14481
+ return new_size;
14140
14482
  }
14141
- auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
14483
+
14484
+ std::mutex mutex;
14485
+ int64_t counter = 0;
14486
+ size_t new_size = 0;
14487
+ bool valid = true;
14488
+ auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
14142
14489
  nrows, n_per_row, imatrix]() {
14143
14490
  const int64_t nrows_per_chunk = chunk_size / n_per_row;
14144
14491
  size_t local_size = 0;
@@ -14153,7 +14500,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14153
14500
  }
14154
14501
  lock.unlock();
14155
14502
  const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
14156
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14503
+ size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14504
+ local_size += this_size;
14505
+
14506
+ // validate the quantized data
14507
+ const size_t row_size = ggml_row_size(new_type, n_per_row);
14508
+ void * this_data = (char *) new_data + first_row * row_size;
14509
+ if (!ggml_validate_row_data(new_type, this_data, this_size)) {
14510
+ std::unique_lock<std::mutex> lock(mutex);
14511
+ valid = false;
14512
+ break;
14513
+ }
14157
14514
  }
14158
14515
  };
14159
14516
  for (int it = 0; it < nthread - 1; ++it) {
@@ -14162,6 +14519,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14162
14519
  compute();
14163
14520
  for (auto & w : workers) { w.join(); }
14164
14521
  workers.clear();
14522
+ if (!valid) {
14523
+ throw std::runtime_error("quantized data validation failed");
14524
+ }
14165
14525
  return new_size;
14166
14526
  }
14167
14527
 
@@ -14224,7 +14584,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14224
14584
  auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
14225
14585
  kv_overrides = v->data();
14226
14586
  }
14227
- llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
14587
+ llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
14228
14588
  ml.init_mappings(false); // no prefetching
14229
14589
 
14230
14590
  llama_model model;
@@ -14262,11 +14622,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14262
14622
  for (auto & o : overrides) {
14263
14623
  if (o.key[0] == 0) break;
14264
14624
  if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
14265
- gguf_set_val_f32(ctx_out, o.key, o.float_value);
14625
+ gguf_set_val_f32(ctx_out, o.key, o.val_f64);
14266
14626
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
14267
- gguf_set_val_i32(ctx_out, o.key, o.int_value);
14627
+ gguf_set_val_i32(ctx_out, o.key, o.val_i64);
14268
14628
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
14269
- gguf_set_val_bool(ctx_out, o.key, o.bool_value);
14629
+ gguf_set_val_bool(ctx_out, o.key, o.val_bool);
14630
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
14631
+ gguf_set_val_str(ctx_out, o.key, o.val_str);
14270
14632
  } else {
14271
14633
  LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
14272
14634
  }
@@ -14308,26 +14670,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14308
14670
  std::vector<no_init<uint8_t>> work;
14309
14671
  std::vector<no_init<float>> f32_conv_buf;
14310
14672
 
14673
+ uint16_t n_split = 1;
14674
+ // Assume split index is continuous
14675
+ if (params->keep_split) {
14676
+ for (int i = 0; i < ml.n_tensors; ++i) {
14677
+ n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
14678
+ }
14679
+ }
14680
+ std::vector<gguf_context*> ctx_outs(n_split, NULL);
14681
+ ctx_outs[0] = ctx_out;
14682
+
14311
14683
  // populate the original tensors so we get an initial meta data
14312
14684
  for (int i = 0; i < ml.n_tensors; ++i) {
14313
- const struct ggml_tensor * meta = ml.get_tensor_meta(i);
14314
- gguf_add_tensor(ctx_out, meta);
14685
+ auto weight = ml.get_weight(i);
14686
+ uint16_t i_split = params->keep_split ? weight->idx : 0;
14687
+ struct ggml_tensor * tensor = weight->tensor;
14688
+ if (ctx_outs[i_split] == NULL) {
14689
+ ctx_outs[i_split] = gguf_init_empty();
14690
+ }
14691
+ gguf_add_tensor(ctx_outs[i_split], tensor);
14315
14692
  }
14316
14693
 
14317
- std::ofstream fout(fname_out, std::ios::binary);
14318
- fout.exceptions(std::ofstream::failbit); // fail fast on write errors
14319
-
14320
- const size_t meta_size = gguf_get_meta_size(ctx_out);
14694
+ // Set split info if needed
14695
+ if (n_split > 1) {
14696
+ for (size_t i = 0; i < ctx_outs.size(); ++i) {
14697
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
14698
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
14699
+ gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
14700
+ }
14701
+ }
14321
14702
 
14322
- LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
14703
+ int cur_split = -1;
14704
+ std::ofstream fout;
14705
+ auto close_ofstream = [&]() {
14706
+ // Write metadata and close file handler
14707
+ if (fout.is_open()) {
14708
+ fout.seekp(0);
14709
+ std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
14710
+ gguf_get_meta_data(ctx_outs[cur_split], data.data());
14711
+ fout.write((const char *) data.data(), data.size());
14712
+ fout.close();
14713
+ }
14714
+ };
14715
+ auto new_ofstream = [&](int index) {
14716
+ cur_split = index;
14717
+ GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
14718
+ std::string fname = fname_out;
14719
+ if (params->keep_split) {
14720
+ char split_path[PATH_MAX] = {0};
14721
+ llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
14722
+ fname = std::string(split_path);
14723
+ }
14323
14724
 
14324
- // placeholder for the meta data
14325
- ::zeros(fout, meta_size);
14725
+ fout = std::ofstream(fname, std::ios::binary);
14726
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
14727
+ const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
14728
+ // placeholder for the meta data
14729
+ ::zeros(fout, meta_size);
14730
+ };
14326
14731
 
14327
14732
  const auto tn = LLM_TN(model.arch);
14328
-
14733
+ new_ofstream(0);
14329
14734
  for (int i = 0; i < ml.n_tensors; ++i) {
14330
- struct ggml_tensor * tensor = ml.get_tensor_meta(i);
14735
+ auto weight = ml.get_weight(i);
14736
+ struct ggml_tensor * tensor = weight->tensor;
14737
+ if (weight->idx != cur_split && params->keep_split) {
14738
+ close_ofstream();
14739
+ new_ofstream(weight->idx);
14740
+ }
14331
14741
 
14332
14742
  const std::string name = ggml_get_name(tensor);
14333
14743
 
@@ -14482,26 +14892,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14482
14892
  total_size_new += new_size;
14483
14893
 
14484
14894
  // update the gguf meta data as we go
14485
- gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
14486
- gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
14895
+ gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
14896
+ gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
14487
14897
 
14488
14898
  // write tensor data + padding
14489
14899
  fout.write((const char *) new_data, new_size);
14490
14900
  zeros(fout, GGML_PAD(new_size, align) - new_size);
14491
14901
  }
14492
-
14493
- // go back to beginning of file and write the updated meta data
14494
- {
14495
- fout.seekp(0);
14496
- std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
14497
- gguf_get_meta_data(ctx_out, data.data());
14498
- fout.write((const char *) data.data(), data.size());
14902
+ close_ofstream();
14903
+ for (auto & c:ctx_outs) {
14904
+ gguf_free(c);
14499
14905
  }
14500
14906
 
14501
- fout.close();
14502
-
14503
- gguf_free(ctx_out);
14504
-
14505
14907
  LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
14506
14908
  LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
14507
14909
 
@@ -14545,7 +14947,7 @@ static int llama_apply_lora_from_file_internal(
14545
14947
  std::unique_ptr<llama_model_loader> ml;
14546
14948
  if (path_base_model) {
14547
14949
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
14548
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
14950
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
14549
14951
  ml->init_mappings(/*prefetch*/ false); // no prefetching
14550
14952
  }
14551
14953
 
@@ -14804,6 +15206,7 @@ struct llama_model_params llama_model_default_params() {
14804
15206
  /*.vocab_only =*/ false,
14805
15207
  /*.use_mmap =*/ true,
14806
15208
  /*.use_mlock =*/ false,
15209
+ /*.check_tensors =*/ false,
14807
15210
  };
14808
15211
 
14809
15212
  #ifdef GGML_USE_METAL
@@ -14840,6 +15243,7 @@ struct llama_context_params llama_context_default_params() {
14840
15243
  /*.logits_all =*/ false,
14841
15244
  /*.embeddings =*/ false,
14842
15245
  /*.offload_kqv =*/ true,
15246
+ /*.flash_attn =*/ false,
14843
15247
  /*.abort_callback =*/ nullptr,
14844
15248
  /*.abort_callback_data =*/ nullptr,
14845
15249
  };
@@ -14857,6 +15261,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
14857
15261
  /*.quantize_output_tensor =*/ true,
14858
15262
  /*.only_copy =*/ false,
14859
15263
  /*.pure =*/ false,
15264
+ /*.keep_split =*/ false,
14860
15265
  /*.imatrix =*/ nullptr,
14861
15266
  /*.kv_overrides =*/ nullptr,
14862
15267
  };
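// [editor's sketch, not part of the diff] The new keep_split flag writes one output
// shard per input shard (named via llama_split_path), and quantized tensor data is now
// validated as it is produced; a validation failure surfaces through the usual non-zero
// return of llama_model_quantize. File names and the target type are placeholders.
llama_model_quantize_params qparams = llama_model_quantize_default_params();
qparams.ftype      = LLAMA_FTYPE_MOSTLY_Q4_K_M;  // target quantization, assumed
qparams.keep_split = true;                       // mirror the input's split layout

if (llama_model_quantize("model-00001-of-00002.gguf", "model-q4_k_m.gguf", &qparams) != 0) {
    fprintf(stderr, "quantization failed (e.g. quantized data validation)\n");
}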
@@ -15005,6 +15410,7 @@ struct llama_context * llama_new_context_with_model(
15005
15410
  cparams.defrag_thold = params.defrag_thold;
15006
15411
  cparams.embeddings = params.embeddings;
15007
15412
  cparams.offload_kqv = params.offload_kqv;
15413
+ cparams.flash_attn = params.flash_attn;
15008
15414
  cparams.pooling_type = params.pooling_type;
15009
15415
 
15010
15416
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -15012,12 +15418,20 @@ struct llama_context * llama_new_context_with_model(
15012
15418
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
15013
15419
 
15014
15420
  // this is necessary due to kv_self.n being padded later during inference
15015
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
15421
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
15016
15422
 
15017
15423
  // with causal attention, the batch size is limited by the context size
15018
15424
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
15019
- cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
15020
15425
 
15426
+ // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
15427
+ // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
15428
+ // ref: https://github.com/ggerganov/llama.cpp/pull/5021
15429
+ if (cparams.n_batch < GGML_KQ_MASK_PAD) {
15430
+ LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
15431
+ cparams.n_batch = GGML_KQ_MASK_PAD;
15432
+ }
15433
+
15434
+ cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
15021
15435
 
15022
15436
  cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
15023
15437
  hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
@@ -15049,6 +15463,23 @@ struct llama_context * llama_new_context_with_model(
15049
15463
  }
15050
15464
  }
15051
15465
 
15466
+ if (cparams.flash_attn && hparams.use_alibi) {
15467
+ LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
15468
+ cparams.flash_attn = false;
15469
+ }
15470
+
15471
+ if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
15472
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
15473
+ cparams.flash_attn = false;
15474
+ }
15475
+
15476
+ #ifdef GGML_USE_HIPBLAS
15477
+ if (cparams.flash_attn) {
15478
+ LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
15479
+ cparams.flash_attn = false;
15480
+ }
15481
+ #endif
15482
+
15052
15483
  if (params.seed == LLAMA_DEFAULT_SEED) {
15053
15484
  params.seed = time(NULL);
15054
15485
  }
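// [editor's sketch, not part of the diff] Wiring up the two new options: check_tensors
// validates tensor data while the model is loaded, and flash_attn opts the context into
// the Flash Attention path (forced off above for ALiBi models, Grok and HIPBLAS builds).
// The model path is a placeholder.
llama_model_params model_params = llama_model_default_params();
model_params.check_tensors = true;               // validate tensor data on load

llama_model * mdl = llama_load_model_from_file("model.gguf", model_params);

llama_context_params ctx_params = llama_context_default_params();
ctx_params.flash_attn = true;                    // may still be forced off, see checks above

llama_context * lctx = llama_new_context_with_model(mdl, ctx_params);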
@@ -15056,6 +15487,7 @@ struct llama_context * llama_new_context_with_model(
15056
15487
  LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
15057
15488
  LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
15058
15489
  LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
15490
+ LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
15059
15491
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
15060
15492
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
15061
15493
 
@@ -15184,7 +15616,7 @@ struct llama_context * llama_new_context_with_model(
15184
15616
  }
15185
15617
  ctx->backends.push_back(ctx->backend_cpu);
15186
15618
 
15187
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
15619
+ if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
15188
15620
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
15189
15621
  llama_free(ctx);
15190
15622
  return nullptr;
@@ -15365,6 +15797,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
15365
15797
  case LLM_ARCH_QWEN2:
15366
15798
  case LLM_ARCH_QWEN2MOE:
15367
15799
  case LLM_ARCH_PHI2:
15800
+ case LLM_ARCH_PHI3:
15368
15801
  case LLM_ARCH_GEMMA:
15369
15802
  case LLM_ARCH_STARCODER2:
15370
15803
  return LLAMA_ROPE_TYPE_NEOX;
@@ -15378,6 +15811,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
15378
15811
  return LLAMA_ROPE_TYPE_NONE;
15379
15812
  }
15380
15813
 
15814
+ enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
15815
+ return ctx->cparams.pooling_type;
15816
+ }
15817
+
15381
15818
  int32_t llama_n_vocab(const struct llama_model * model) {
15382
15819
  return model->hparams.n_vocab;
15383
15820
  }
@@ -15778,6 +16215,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
15778
16215
  const size_t s_kv_head = sizeof(uint32_t);
15779
16216
  const size_t s_kv_size = sizeof(uint32_t);
15780
16217
  const size_t s_kv_used = sizeof(uint32_t);
16218
+ const size_t s_v_trans = sizeof(uint32_t);
15781
16219
  const size_t s_kv = ctx->kv_self.total_size();
15782
16220
  const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
15783
16221
  const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
@@ -15795,10 +16233,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
15795
16233
  + s_kv_head
15796
16234
  + s_kv_size
15797
16235
  + s_kv_used
16236
+ + s_v_trans
15798
16237
  + s_kv
15799
16238
  + s_kv_cells
15800
16239
  );
15801
16240
 
16241
+ // on session change it is very likely that the state size has changed - so we need to update this function
16242
+ static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
16243
+
15802
16244
  return s_total;
15803
16245
  }
15804
16246
 
@@ -15856,6 +16298,8 @@ struct llama_data_file_context : llama_data_context {
15856
16298
  *
15857
16299
  */
15858
16300
  static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
16301
+ llama_synchronize(ctx);
16302
+
15859
16303
  // copy rng
15860
16304
  {
15861
16305
  std::ostringstream rng_ss;
@@ -15942,11 +16386,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
15942
16386
  const uint32_t kv_size = kv_self.size;
15943
16387
  const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
15944
16388
  const uint32_t kv_used = kv_self.used;
16389
+ const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
15945
16390
 
15946
16391
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
15947
16392
  data_ctx->write(&kv_head, sizeof(kv_head));
15948
16393
  data_ctx->write(&kv_size, sizeof(kv_size));
15949
16394
  data_ctx->write(&kv_used, sizeof(kv_used));
16395
+ data_ctx->write(&v_trans, sizeof(v_trans));
15950
16396
 
15951
16397
  if (kv_buf_size) {
15952
16398
  const size_t pre_kv_buf_size = data_ctx->get_size_written();
@@ -15959,7 +16405,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
15959
16405
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
15960
16406
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
15961
16407
 
15962
- if (kv_self.recurrent) {
16408
+ if (kv_self.recurrent || !kv_self.v_trans) {
15963
16409
  // v is contiguous for recurrent models
15964
16410
  // TODO: use other tensors for state models than k and v
15965
16411
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16008,6 +16454,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
16008
16454
 
16009
16455
  // Sets the state reading from the specified source address
16010
16456
  size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16457
+ llama_synchronize(ctx);
16458
+
16011
16459
  const uint8_t * inp = src;
16012
16460
 
16013
16461
  // set rng
@@ -16090,11 +16538,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16090
16538
  uint32_t kv_head;
16091
16539
  uint32_t kv_size;
16092
16540
  uint32_t kv_used;
16541
+ uint32_t v_trans;
16093
16542
 
16094
16543
  memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
16095
16544
  memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
16096
16545
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
16097
16546
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
16547
+ memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
16548
+
16549
+ GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
16098
16550
 
16099
16551
  if (kv_self.size != kv_size) {
16100
16552
  // the KV cache needs to be big enough to load all the KV cells from the saved state
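// [editor's sketch, not part of the diff] Full-state round trip: both directions now
// call llama_synchronize() first, and the serialized blob carries the v_trans flag, so
// a state saved with one V-cache layout cannot be silently restored into the other.
std::vector<uint8_t> state(llama_state_get_size(ctx));
const size_t n_written = llama_state_get_data(ctx, state.data());
// ... later, on a context created with compatible parameters ...
const size_t n_read = llama_state_set_data(ctx, state.data());
GGML_ASSERT(n_read == n_written);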
@@ -16104,6 +16556,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16104
16556
  __func__, kv_head, kv_size, kv_self.size);
16105
16557
  }
16106
16558
 
16559
+ llama_kv_cache_clear(ctx);
16560
+
16107
16561
  if (kv_buf_size) {
16108
16562
  const size_t pre_kv_buf_size = inp - src;
16109
16563
 
@@ -16115,7 +16569,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16115
16569
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
16116
16570
  inp += k_size;
16117
16571
 
16118
- if (kv_self.recurrent) {
16572
+ if (kv_self.recurrent || !kv_self.v_trans) {
16119
16573
  // v is contiguous for recurrent models
16120
16574
  // TODO: use other tensors for state models than k and v
16121
16575
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16137,8 +16591,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16137
16591
  GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
16138
16592
  }
16139
16593
 
16140
- llama_kv_cache_clear(ctx);
16141
-
16142
16594
  ctx->kv_self.head = kv_head;
16143
16595
  ctx->kv_self.used = kv_used;
16144
16596
 
@@ -16312,6 +16764,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
16312
16764
  }
16313
16765
 
16314
16766
  static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
16767
+ llama_synchronize(ctx);
16768
+
16315
16769
  const auto & kv_self = ctx->kv_self;
16316
16770
  GGML_ASSERT(!kv_self.recurrent); // not implemented
16317
16771
 
@@ -16396,28 +16850,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
16396
16850
  }
16397
16851
  }
16398
16852
 
16399
- // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16400
- const uint32_t kv_size = kv_self.size;
16401
- for (int il = 0; il < (int)n_layer; ++il) {
16402
- // Write value type
16403
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16404
- data_ctx.write(&v_type_i, sizeof(v_type_i));
16853
+ // TODO: simplify, reduce copy-paste
16854
+ if (!kv_self.v_trans) {
16855
+ for (int il = 0; il < (int)n_layer; ++il) {
16856
+ // Write value type
16857
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16858
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16405
16859
 
16406
- // Write element size
16407
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16408
- data_ctx.write(&v_size_el, sizeof(v_size_el));
16860
+ // Write row size of value
16861
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
16862
+ data_ctx.write(&v_size_row, sizeof(v_size_row));
16409
16863
 
16410
- // For each row, we get the element values of each cell
16411
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16412
- // Read each range of cells of v_size_el length each into tmp_buf and write out
16864
+ // Read each range of cells of v_size length each into tmp_buf and write out
16413
16865
  for (const auto & range : cell_ranges) {
16414
16866
  const size_t range_size = range.second - range.first;
16415
- const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16416
- tmp_buf.resize(range_size * v_size_el);
16417
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16867
+ tmp_buf.resize(range_size * v_size_row);
16868
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
16418
16869
  data_ctx.write(tmp_buf.data(), tmp_buf.size());
16419
16870
  }
16420
16871
  }
16872
+ } else {
16873
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16874
+ const uint32_t kv_size = kv_self.size;
16875
+ for (int il = 0; il < (int)n_layer; ++il) {
16876
+ // Write value type
16877
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16878
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16879
+
16880
+ // Write element size
16881
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16882
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
16883
+
16884
+ // For each row, we get the element values of each cell
16885
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16886
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
16887
+ for (const auto & range : cell_ranges) {
16888
+ const size_t range_size = range.second - range.first;
16889
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16890
+ tmp_buf.resize(range_size * v_size_el);
16891
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16892
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
16893
+ }
16894
+ }
16895
+ }
16421
16896
  }
16422
16897
 
16423
16898
  return data_ctx.get_size_written();
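// [editor's sketch, not part of the diff] Sequence-level state copy between contexts;
// the serialization format now depends on kv_self.v_trans (row-wise for the
// non-transposed V cache, element-wise for the transposed one) and is checked on
// restore. `src_ctx` and `dst_ctx` are assumed to be two compatible contexts.
std::vector<uint8_t> seq_buf(llama_state_seq_get_size(src_ctx, 0));
llama_state_seq_get_data(src_ctx, seq_buf.data(), 0);       // save sequence 0
llama_state_seq_set_data(dst_ctx, seq_buf.data(), 1);       // restore as sequence 1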
@@ -16429,6 +16904,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
16429
16904
  }
16430
16905
 
16431
16906
  size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
16907
+ llama_synchronize(ctx);
16908
+
16432
16909
  auto & kv_self = ctx->kv_self;
16433
16910
  GGML_ASSERT(!kv_self.recurrent); // not implemented
16434
16911
 
@@ -16540,41 +17017,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
16540
17017
  }
16541
17018
  }
16542
17019
 
16543
- // For each layer, read the values for each cell (transposed)
16544
- for (int il = 0; il < (int)n_layer; ++il) {
16545
- // Read type of value
16546
- int32_t v_type_i_ref;
16547
- memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
16548
- inp += sizeof(v_type_i_ref);
16549
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16550
- if (v_type_i != v_type_i_ref) {
16551
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16552
- LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
16553
- return 0;
16554
- }
17020
+ // TODO: simplify, reduce copy-paste
17021
+ if (!kv_self.v_trans) {
17022
+ for (int il = 0; il < (int)n_layer; ++il) {
17023
+ // Read type of value
17024
+ int32_t v_type_i_ref;
17025
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
17026
+ inp += sizeof(v_type_i_ref);
17027
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
17028
+ if (v_type_i != v_type_i_ref) {
17029
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17030
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
17031
+ return 0;
17032
+ }
16555
17033
 
16556
- // Read element size of value
16557
- size_t v_size_el_ref;
16558
- memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
16559
- inp += sizeof(v_size_el_ref);
16560
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16561
- if (v_size_el != v_size_el_ref) {
16562
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16563
- LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
16564
- return 0;
16565
- }
17034
+ // Read row size of value
17035
+ size_t v_size_row_ref;
17036
+ memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
17037
+ inp += sizeof(v_size_row_ref);
17038
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
17039
+ if (v_size_row != v_size_row_ref) {
17040
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17041
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
17042
+ return 0;
17043
+ }
16566
17044
 
16567
- if (cell_count) {
16568
- // For each row in the transposed matrix, read the values for the whole cell range
16569
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16570
- const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
16571
- ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
16572
- inp += cell_count * v_size_el;
17045
+ if (cell_count) {
17046
+ // Read and set the values for the whole cell range
17047
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
17048
+ inp += cell_count * v_size_row;
17049
+ }
17050
+ }
17051
+ } else {
17052
+ // For each layer, read the values for each cell (transposed)
17053
+ for (int il = 0; il < (int)n_layer; ++il) {
17054
+ // Read type of value
17055
+ int32_t v_type_i_ref;
17056
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
17057
+ inp += sizeof(v_type_i_ref);
17058
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
17059
+ if (v_type_i != v_type_i_ref) {
17060
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17061
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
17062
+ return 0;
17063
+ }
17064
+
17065
+ // Read element size of value
17066
+ size_t v_size_el_ref;
17067
+ memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
17068
+ inp += sizeof(v_size_el_ref);
17069
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
17070
+ if (v_size_el != v_size_el_ref) {
17071
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
17072
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
17073
+ return 0;
17074
+ }
17075
+
17076
+ if (cell_count) {
17077
+ // For each row in the transposed matrix, read the values for the whole cell range
17078
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
17079
+ const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
17080
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
17081
+ inp += cell_count * v_size_el;
17082
+ }
16573
17083
  }
16574
17084
  }
16575
17085
  }
16576
17086
 
16577
17087
  const size_t nread = inp - src;
17088
+
16578
17089
  return nread;
16579
17090
  }
16580
17091
 
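
Editor's note: taken together, `llama_state_seq_get_data` and `llama_state_seq_set_data` let a caller move one sequence's KV-cache state between contexts, and both now call `llama_synchronize` first so no pending backend work is in flight while the tensors are read or written. A rough usage sketch, assuming the companion `llama_state_seq_get_size` query from the public header alongside the get/set signatures shown in this diff (the helper name is ours; error handling is minimal):

    #include <cstdint>
    #include <vector>

    #include "llama.h"

    // Copy the KV-cache state of sequence src_id in src_ctx into dst_id of dst_ctx.
    // Returns false if the state is empty or the destination layout does not match.
    static bool copy_seq_state(llama_context * src_ctx, llama_seq_id src_id,
                               llama_context * dst_ctx, llama_seq_id dst_id) {
        const size_t size = llama_state_seq_get_size(src_ctx, src_id);
        if (size == 0) {
            return false;
        }
        std::vector<uint8_t> buf(size);
        if (llama_state_seq_get_data(src_ctx, buf.data(), src_id) == 0) {
            return false;
        }
        // set_data returns the number of bytes consumed, or 0 when the stored
        // type/row/element sizes do not match the destination cache (see the checks above)
        return llama_state_seq_set_data(dst_ctx, buf.data(), dst_id) != 0;
    }
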
@@ -16880,6 +17391,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
16880
17391
  return model->vocab.id_to_token[token].type;
16881
17392
  }
16882
17393
 
17394
+ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
17395
+ return token != -1 && (
17396
+ token == llama_token_eos(model) ||
17397
+ token == llama_token_eot(model)
17398
+ );
17399
+ }
17400
+
16883
17401
  llama_token llama_token_bos(const struct llama_model * model) {
16884
17402
  return model->vocab.special_bos_id;
16885
17403
  }
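
Editor's note: `llama_token_is_eog` folds the EOS and EOT checks into one call, which is what generation loops need for models such as Llama 3 that can end a turn with a token other than EOS. A hedged sketch of how a caller might use it instead of comparing against `llama_token_eos` directly; `sample_next` is a hypothetical callback standing in for whatever sampling code the application already has:

    #include <functional>
    #include <vector>

    #include "llama.h"

    // Collect sampled tokens until an end-of-generation token appears.
    static std::vector<llama_token> generate_until_eog(
            const llama_model * model,
            const std::function<llama_token()> & sample_next,
            int max_tokens) {
        std::vector<llama_token> out;
        for (int i = 0; i < max_tokens; ++i) {
            const llama_token tok = sample_next();
            if (llama_token_is_eog(model, tok)) {
                break; // EOS or EOT, not just llama_token_eos()
            }
            out.push_back(tok);
        }
        return out;
    }
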
@@ -16957,7 +17475,7 @@ static std::string llama_decode_text(const std::string & text) {
16957
17475
  }
16958
17476
 
16959
17477
  // does not write null-terminator to buf
16960
- int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
17478
+ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
16961
17479
  if (0 <= token && token < llama_n_vocab(model)) {
16962
17480
  switch (llama_vocab_get_type(model->vocab)) {
16963
17481
  case LLAMA_VOCAB_TYPE_WPM:
@@ -16972,7 +17490,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
16972
17490
  }
16973
17491
  memcpy(buf, result.c_str(), result.length());
16974
17492
  return result.length();
16975
- } else if (llama_is_user_defined_token(model->vocab, token)) {
17493
+ } else if (
17494
+ (llama_is_user_defined_token(model->vocab, token)) ||
17495
+ (llama_is_control_token (model->vocab, token) && special)) {
16976
17496
  std::string result = model->vocab.id_to_token[token].text;
16977
17497
  if (length < (int) result.length()) {
16978
17498
  return -(int) result.length();
@@ -16985,8 +17505,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
16985
17505
  }
16986
17506
  memcpy(buf, "\xe2\x96\x85", 3);
16987
17507
  return 3;
16988
- } else if (llama_is_control_token(model->vocab, token)) {
16989
- ;
16990
17508
  } else if (llama_is_byte_token(model->vocab, token)) {
16991
17509
  if (length < 1) {
16992
17510
  return -1;
@@ -17007,15 +17525,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
17007
17525
  }
17008
17526
  memcpy(buf, result.c_str(), result.length());
17009
17527
  return result.length();
17010
- } else if (llama_is_user_defined_token(model->vocab, token)) {
17528
+ } else if (
17529
+ (llama_is_user_defined_token(model->vocab, token)) ||
17530
+ (llama_is_control_token (model->vocab, token) && special)) {
17011
17531
  std::string result = model->vocab.id_to_token[token].text;
17012
17532
  if (length < (int) result.length()) {
17013
17533
  return -(int) result.length();
17014
17534
  }
17015
17535
  memcpy(buf, result.c_str(), result.length());
17016
17536
  return result.length();
17017
- } else if (llama_is_control_token(model->vocab, token)) {
17018
- ;
17019
17537
  }
17020
17538
  break;
17021
17539
  }
@@ -17213,6 +17731,24 @@ static int32_t llama_chat_apply_template_internal(
17213
17731
  if (add_ass) {
17214
17732
  ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
17215
17733
  }
17734
+ } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
17735
+ // Llama 3
17736
+ for (auto message : chat) {
17737
+ std::string role(message->role);
17738
+ ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
17739
+ }
17740
+ if (add_ass) {
17741
+ ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
17742
+ }
17743
+ } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
17744
+ // Phi 3
17745
+ for (auto message : chat) {
17746
+ std::string role(message->role);
17747
+ ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
17748
+ }
17749
+ if (add_ass) {
17750
+ ss << "<|assistant|>\n";
17751
+ }
17216
17752
  } else {
17217
17753
  // template not supported
17218
17754
  return -1;
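
Editor's note: the new branches teach the template formatter the Llama 3 and Phi 3 layouts, selected either by the shorthand names "llama3"/"phi3" or by detecting the markers in the model's own template string. A hedged usage sketch, assuming the public `llama_chat_apply_template` wrapper and `llama_chat_message` struct that back this internal function, and assuming the model argument is ignored when an explicit template name is supplied (the internal function above only ever sees the resolved template):

    #include <string>
    #include <vector>

    #include "llama.h"

    // Format a conversation with the Phi 3 template and append the assistant
    // header (add_ass = true). Buffer size and helper name are illustrative.
    static std::string format_phi3(const std::vector<llama_chat_message> & msgs) {
        std::vector<char> buf(4096);
        int32_t n = llama_chat_apply_template(
            /*model  =*/ nullptr,
            /*tmpl   =*/ "phi3",
            msgs.data(), msgs.size(),
            /*add_ass=*/ true,
            buf.data(), (int32_t) buf.size());
        if (n > (int32_t) buf.size()) {
            // the return value is the full formatted length -- grow and retry
            buf.resize((size_t) n);
            n = llama_chat_apply_template(nullptr, "phi3", msgs.data(), msgs.size(),
                                          true, buf.data(), (int32_t) buf.size());
        }
        return n >= 0 ? std::string(buf.data(), (size_t) n) : std::string();
    }

Per the branch above, a system plus user message pair would come out as "<|system|>\n...<|end|>\n<|user|>\n...<|end|>\n<|assistant|>\n".
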
@@ -17345,6 +17881,11 @@ const char * llama_print_system_info(void) {
17345
17881
  s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
17346
17882
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
17347
17883
  s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
17884
+ #ifdef GGML_USE_LLAMAFILE
17885
+ s += "LLAMAFILE = 1 | ";
17886
+ #else
17887
+ s += "LLAMAFILE = 0 | ";
17888
+ #endif
17348
17889
 
17349
17890
  return s.c_str();
17350
17891
  }