llama_cpp 0.12.7 → 0.13.0

@@ -68,10 +68,12 @@
68
68
  #include <cstdio>
69
69
  #include <cstring>
70
70
  #include <ctime>
71
+ #include <cwctype>
71
72
  #include <forward_list>
72
73
  #include <fstream>
73
74
  #include <functional>
74
75
  #include <initializer_list>
76
+ #include <locale>
75
77
  #include <map>
76
78
  #include <memory>
77
79
  #include <mutex>
@@ -850,9 +852,9 @@ struct LLM_TN {
850
852
  //
851
853
 
852
854
  static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
853
- { LLAMA_ROPE_SCALING_NONE, "none" },
854
- { LLAMA_ROPE_SCALING_LINEAR, "linear" },
855
- { LLAMA_ROPE_SCALING_YARN, "yarn" },
855
+ { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
856
+ { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
857
+ { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
856
858
  };
857
859
 
858
860
  static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
@@ -862,7 +864,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
862
864
  }
863
865
  }
864
866
 
865
- return LLAMA_ROPE_SCALING_UNSPECIFIED;
867
+ return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
866
868
  }
867
869
 
868
870
  static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
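Annotation: the rope-scaling constants gain a _TYPE_ infix in this release (LLAMA_ROPE_SCALING_NONE becomes LLAMA_ROPE_SCALING_TYPE_NONE, and so on). A minimal caller-side sketch, assuming the public llama_context_params field is still named rope_scaling_type; model stands in for an already-loaded model:

    // illustrative only: request YaRN scaling with the renamed enum value
    llama_context_params cparams = llama_context_default_params();
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; // was LLAMA_ROPE_SCALING_YARN in 0.12.x
    llama_context * ctx = llama_new_context_with_model(model, cparams);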
@@ -1550,8 +1552,9 @@ static const size_t MiB = 1024*kiB;
1550
1552
  static const size_t GiB = 1024*MiB;
1551
1553
 
1552
1554
  struct llama_hparams {
1553
- bool vocab_only;
1554
- bool rope_finetuned;
1555
+ bool vocab_only;
1556
+ bool rope_finetuned;
1557
+
1555
1558
  uint32_t n_vocab;
1556
1559
  uint32_t n_ctx_train; // context size the model was trained on
1557
1560
  uint32_t n_embd;
@@ -1580,7 +1583,8 @@ struct llama_hparams {
1580
1583
  bool causal_attn = true;
1581
1584
  bool need_kq_pos = false;
1582
1585
 
1583
- uint32_t pooling_type = LLAMA_POOLING_NONE;
1586
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1587
+ enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
1584
1588
 
1585
1589
  bool operator!=(const llama_hparams & other) const {
1586
1590
  if (this->vocab_only != other.vocab_only) return true;
@@ -1639,8 +1643,8 @@ struct llama_cparams {
1639
1643
  float yarn_attn_factor;
1640
1644
  float yarn_beta_fast;
1641
1645
  float yarn_beta_slow;
1646
+ float defrag_thold;
1642
1647
 
1643
- bool mul_mat_q;
1644
1648
  bool offload_kqv;
1645
1649
  bool do_pooling;
1646
1650
 
@@ -1707,11 +1711,20 @@ struct llama_kv_cell {
1707
1711
  bool has_seq_id(const llama_seq_id & id) const {
1708
1712
  return seq_id.find(id) != seq_id.end();
1709
1713
  }
1714
+
1715
+ bool is_empty() const {
1716
+ return seq_id.empty();
1717
+ }
1718
+
1719
+ bool is_same_seq(const llama_kv_cell & other) const {
1720
+ return seq_id == other.seq_id;
1721
+ }
1710
1722
  };
1711
1723
 
1712
1724
  // ring-buffer of cached KV data
1713
1725
  struct llama_kv_cache {
1714
1726
  bool has_shift = false;
1727
+ bool do_defrag = false;
1715
1728
 
1716
1729
  // Note: The value of head isn't only used to optimize searching
1717
1730
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -1723,6 +1736,9 @@ struct llama_kv_cache {
1723
1736
  // computed before each graph build
1724
1737
  uint32_t n = 0;
1725
1738
 
1739
+ ggml_type type_k = GGML_TYPE_F16;
1740
+ ggml_type type_v = GGML_TYPE_F16;
1741
+
1726
1742
  std::vector<llama_kv_cell> cells;
1727
1743
 
1728
1744
  std::vector<struct ggml_tensor *> k_l; // per layer
@@ -1958,8 +1974,8 @@ struct llama_context {
1958
1974
  static bool llama_kv_cache_init(
1959
1975
  struct llama_kv_cache & cache,
1960
1976
  const llama_model & model,
1961
- ggml_type ktype,
1962
- ggml_type vtype,
1977
+ ggml_type type_k,
1978
+ ggml_type type_v,
1963
1979
  uint32_t n_ctx,
1964
1980
  bool offload) {
1965
1981
  const struct llama_hparams & hparams = model.hparams;
@@ -1974,6 +1990,9 @@ static bool llama_kv_cache_init(
1974
1990
  cache.size = n_ctx;
1975
1991
  cache.used = 0;
1976
1992
 
1993
+ cache.type_k = type_k;
1994
+ cache.type_v = type_v;
1995
+
1977
1996
  cache.cells.clear();
1978
1997
  cache.cells.resize(n_ctx);
1979
1998
 
@@ -2014,8 +2033,8 @@ static bool llama_kv_cache_init(
2014
2033
 
2015
2034
  for (int i = 0; i < (int) n_layer; i++) {
2016
2035
  struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
2017
- ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
2018
- ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
2036
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
2037
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
2019
2038
  ggml_format_name(k, "cache_k_l%d", i);
2020
2039
  ggml_format_name(v, "cache_v_l%d", i);
2021
2040
  cache.k_l.push_back(k);
@@ -2099,7 +2118,7 @@ static bool llama_kv_cache_find_slot(
2099
2118
  // find how many cells are currently in use
2100
2119
  static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
2101
2120
  for (uint32_t i = cache.size - 1; i > 0; --i) {
2102
- if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
2121
+ if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
2103
2122
  return i + 1;
2104
2123
  }
2105
2124
  }
@@ -2135,7 +2154,7 @@ static void llama_kv_cache_seq_rm(
2135
2154
  } else {
2136
2155
  continue;
2137
2156
  }
2138
- if (cache.cells[i].seq_id.empty()) {
2157
+ if (cache.cells[i].is_empty()) {
2139
2158
  // keep count of the number of used cells
2140
2159
  if (cache.cells[i].pos >= 0) cache.used--;
2141
2160
 
@@ -2186,7 +2205,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
2186
2205
  if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
2187
2206
  }
2188
2207
 
2189
- static void llama_kv_cache_seq_shift(
2208
+ static void llama_kv_cache_seq_add(
2190
2209
  struct llama_kv_cache & cache,
2191
2210
  llama_seq_id seq_id,
2192
2211
  llama_pos p0,
@@ -2204,10 +2223,14 @@ static void llama_kv_cache_seq_shift(
2204
2223
  cache.cells[i].delta += delta;
2205
2224
 
2206
2225
  if (cache.cells[i].pos < 0) {
2207
- if (!cache.cells[i].seq_id.empty()) cache.used--;
2226
+ if (!cache.cells[i].is_empty()) {
2227
+ cache.used--;
2228
+ }
2208
2229
  cache.cells[i].pos = -1;
2209
2230
  cache.cells[i].seq_id.clear();
2210
- if (new_head == cache.size) new_head = i;
2231
+ if (new_head == cache.size) {
2232
+ new_head = i;
2233
+ }
2211
2234
  }
2212
2235
  }
2213
2236
  }
@@ -2239,6 +2262,22 @@ static void llama_kv_cache_seq_div(
2239
2262
  }
2240
2263
  }
2241
2264
 
2265
+ static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
2266
+ llama_pos result = 0;
2267
+
2268
+ for (uint32_t i = 0; i < cache.size; ++i) {
2269
+ if (cache.cells[i].has_seq_id(seq_id)) {
2270
+ result = std::max(result, cache.cells[i].pos);
2271
+ }
2272
+ }
2273
+
2274
+ return result;
2275
+ }
2276
+
2277
+ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
2278
+ cache.do_defrag = true;
2279
+ }
2280
+
2242
2281
  //
2243
2282
  // model loading and saving
2244
2283
  //
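Annotation: in the hunks above, llama_kv_cache_seq_shift is renamed to llama_kv_cache_seq_add, and defragmentation is queued through the new do_defrag flag instead of being performed inline. Assuming the matching public entry points in llama.h use the same names, a typical context shift on sequence 0 might look like this sketch (n_keep and n_discard are placeholders):

    // drop n_discard tokens after the first n_keep, then shift the remainder left
    llama_kv_cache_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
    llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, -1, -n_discard);
    // optionally compact the cache; the shift and the defrag are both applied
    // lazily, on the next llama_kv_cache_update() or decode call
    llama_kv_cache_defrag(ctx);
    llama_kv_cache_update(ctx);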
@@ -2310,7 +2349,7 @@ namespace GGUFMeta {
2310
2349
  }
2311
2350
  };
2312
2351
 
2313
- struct ArrayInfo{
2352
+ struct ArrayInfo {
2314
2353
  const gguf_type gt;
2315
2354
  const size_t length;
2316
2355
  const void * data;
@@ -2329,7 +2368,7 @@ namespace GGUFMeta {
2329
2368
  };
2330
2369
 
2331
2370
  template<typename T>
2332
- class GKV: public GKV_Base<T> {
2371
+ class GKV : public GKV_Base<T> {
2333
2372
  GKV() = delete;
2334
2373
 
2335
2374
  public:
@@ -2345,46 +2384,46 @@ namespace GGUFMeta {
2345
2384
 
2346
2385
  static const char * override_type_to_str(const llama_model_kv_override_type ty) {
2347
2386
  switch (ty) {
2348
- case LLAMA_KV_OVERRIDE_BOOL: return "bool";
2349
- case LLAMA_KV_OVERRIDE_INT: return "int";
2350
- case LLAMA_KV_OVERRIDE_FLOAT: return "float";
2387
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
2388
+ case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
2389
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
2351
2390
  }
2352
2391
  return "unknown";
2353
2392
  }
2354
2393
 
2355
- static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
2356
- if (!override) { return false; }
2357
- if (override->tag == expected_type) {
2394
+ static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
2395
+ if (!ovrd) { return false; }
2396
+ if (ovrd->tag == expected_type) {
2358
2397
  LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
2359
- __func__, override_type_to_str(override->tag), override->key);
2360
- switch (override->tag) {
2361
- case LLAMA_KV_OVERRIDE_BOOL: {
2362
- LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
2398
+ __func__, override_type_to_str(ovrd->tag), ovrd->key);
2399
+ switch (ovrd->tag) {
2400
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
2401
+ LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
2363
2402
  } break;
2364
- case LLAMA_KV_OVERRIDE_INT: {
2365
- LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
2403
+ case LLAMA_KV_OVERRIDE_TYPE_INT: {
2404
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
2366
2405
  } break;
2367
- case LLAMA_KV_OVERRIDE_FLOAT: {
2368
- LLAMA_LOG_INFO("%.6f\n", override->float_value);
2406
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
2407
+ LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
2369
2408
  } break;
2370
2409
  default:
2371
2410
  // Shouldn't be possible to end up here, but just in case...
2372
2411
  throw std::runtime_error(
2373
2412
  format("Unsupported attempt to override %s type for metadata key %s\n",
2374
- override_type_to_str(override->tag), override->key));
2413
+ override_type_to_str(ovrd->tag), ovrd->key));
2375
2414
  }
2376
2415
  return true;
2377
2416
  }
2378
2417
  LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
2379
- __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
2418
+ __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
2380
2419
  return false;
2381
2420
  }
2382
2421
 
2383
2422
  template<typename OT>
2384
2423
  static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
2385
- try_override(OT & target, const struct llama_model_kv_override *override) {
2386
- if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
2387
- target = override->bool_value;
2424
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2425
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
2426
+ target = ovrd->bool_value;
2388
2427
  return true;
2389
2428
  }
2390
2429
  return false;
@@ -2392,9 +2431,9 @@ namespace GGUFMeta {
2392
2431
 
2393
2432
  template<typename OT>
2394
2433
  static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
2395
- try_override(OT & target, const struct llama_model_kv_override *override) {
2396
- if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
2397
- target = override->int_value;
2434
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2435
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
2436
+ target = ovrd->int_value;
2398
2437
  return true;
2399
2438
  }
2400
2439
  return false;
@@ -2402,9 +2441,9 @@ namespace GGUFMeta {
2402
2441
 
2403
2442
  template<typename OT>
2404
2443
  static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
2405
- try_override(T & target, const struct llama_model_kv_override *override) {
2406
- if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
2407
- target = override->float_value;
2444
+ try_override(T & target, const struct llama_model_kv_override * ovrd) {
2445
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
2446
+ target = ovrd->float_value;
2408
2447
  return true;
2409
2448
  }
2410
2449
  return false;
@@ -2412,17 +2451,17 @@ namespace GGUFMeta {
2412
2451
 
2413
2452
  template<typename OT>
2414
2453
  static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
2415
- try_override(T & target, const struct llama_model_kv_override *override) {
2454
+ try_override(T & target, const struct llama_model_kv_override * ovrd) {
2416
2455
  (void)target;
2417
- (void)override;
2418
- if (!override) { return false; }
2456
+ (void)ovrd;
2457
+ if (!ovrd) { return false; }
2419
2458
  // Currently, we should never end up here so it would be a bug if we do.
2420
2459
  throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
2421
- override ? override->key : "NULL"));
2460
+ ovrd ? ovrd->key : "NULL"));
2422
2461
  }
2423
2462
 
2424
- static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
2425
- if (try_override<T>(target, override)) {
2463
+ static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
2464
+ if (try_override<T>(target, ovrd)) {
2426
2465
  return true;
2427
2466
  }
2428
2467
  if (k < 0) { return false; }
@@ -2430,12 +2469,12 @@ namespace GGUFMeta {
2430
2469
  return true;
2431
2470
  }
2432
2471
 
2433
- static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
2434
- return set(ctx, gguf_find_key(ctx, key), target, override);
2472
+ static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
2473
+ return set(ctx, gguf_find_key(ctx, key), target, ovrd);
2435
2474
  }
2436
2475
 
2437
- static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
2438
- return set(ctx, key.c_str(), target, override);
2476
+ static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
2477
+ return set(ctx, key.c_str(), target, ovrd);
2439
2478
  }
2440
2479
  };
2441
2480
  }
@@ -2542,9 +2581,12 @@ struct llama_model_loader {
2542
2581
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
2543
2582
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2544
2583
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2584
+ case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
2545
2585
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
2546
2586
  case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
2547
2587
  case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
2588
+ case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
2589
+ case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
2548
2590
  default:
2549
2591
  {
2550
2592
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2845,6 +2887,15 @@ struct llama_model_loader {
2845
2887
  }
2846
2888
  };
2847
2889
 
2890
+ template<>
2891
+ bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
2892
+ uint32_t tmp;
2893
+ const bool found = get_key(kid, tmp, required);
2894
+ result = (enum llama_pooling_type) tmp;
2895
+ return found;
2896
+ }
2897
+
2898
+
2848
2899
  //
2849
2900
  // load LLaMA models
2850
2901
  //
@@ -2886,10 +2937,15 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2886
2937
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
2887
2938
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
2888
2939
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2889
- case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
2940
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
2941
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
2942
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
2890
2943
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
2891
2944
  case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
2892
2945
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
2946
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
2947
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
2948
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
2893
2949
 
2894
2950
  default: return "unknown, may not work";
2895
2951
  }
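Annotation: this release adds several new IQ quantization file types (IQ2_S, IQ2_M, IQ3_XS, IQ3_S, IQ3_M, IQ4_XS) alongside the existing IQ formats. A hedged sketch of requesting one of them through the quantization API (file names are placeholders):

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; // 4.25 bpw, new in this version
    llama_model_quantize("model-f16.gguf", "model-iq4_xs.gguf", &qparams);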
@@ -2923,16 +2979,16 @@ static const char * llama_model_type_name(e_model type) {
2923
2979
  default: return "?B";
2924
2980
  }
2925
2981
  }
2982
+
2926
2983
  static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
2927
2984
  switch (type) {
2928
- case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2929
- case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2930
- case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2931
- default: return "unknown";
2985
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2986
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2987
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2988
+ default: return "unknown";
2932
2989
  }
2933
2990
  }
2934
2991
 
2935
-
2936
2992
  static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
2937
2993
  model.arch = ml.get_arch();
2938
2994
  if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -2996,7 +3052,7 @@ static void llm_load_hparams(
2996
3052
  std::string rope_scaling("linear");
2997
3053
  ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
2998
3054
  hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
2999
- GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
3055
+ GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
3000
3056
 
3001
3057
  // rope_freq_scale (inverse of the kv) is optional
3002
3058
  float ropescale = 0.0f;
@@ -3109,10 +3165,10 @@ static void llm_load_hparams(
3109
3165
  } break;
3110
3166
  case LLM_ARCH_BERT:
3111
3167
  {
3112
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3113
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3168
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3169
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3114
3170
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3115
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3171
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3116
3172
 
3117
3173
  switch (hparams.n_layer) {
3118
3174
  case 3:
@@ -3130,10 +3186,10 @@ static void llm_load_hparams(
3130
3186
  } break;
3131
3187
  case LLM_ARCH_NOMIC_BERT:
3132
3188
  {
3133
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3134
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3189
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3190
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3135
3191
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3136
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3192
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3137
3193
 
3138
3194
  if (hparams.n_layer == 12 && hparams.n_embd == 768) {
3139
3195
  model.type = e_model::MODEL_137M;
@@ -3272,6 +3328,8 @@ static void llm_load_hparams(
3272
3328
  if (hparams.f_max_alibi_bias > 0.0f) {
3273
3329
  hparams.need_kq_pos = true;
3274
3330
  }
3331
+
3332
+ hparams.rope_type = llama_rope_type(&model);
3275
3333
  }
3276
3334
 
3277
3335
  // TODO: This should probably be in llama.h
@@ -3574,6 +3632,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3574
3632
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
3575
3633
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
3576
3634
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
3635
+ LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
3636
+ LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
3577
3637
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
3578
3638
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
3579
3639
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -3640,7 +3700,7 @@ static bool llm_load_tensors(
3640
3700
  model.buft_layer[i] = llama_default_buffer_type_cpu(true);
3641
3701
  }
3642
3702
 
3643
- if (split_mode == LLAMA_SPLIT_LAYER) {
3703
+ if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
3644
3704
  // calculate the split points
3645
3705
  int device_count = llama_get_device_count();
3646
3706
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
@@ -3679,10 +3739,10 @@ static bool llm_load_tensors(
3679
3739
  }
3680
3740
  } else {
3681
3741
  ggml_backend_buffer_type_t split_buft;
3682
- if (split_mode == LLAMA_SPLIT_ROW) {
3742
+ if (split_mode == LLAMA_SPLIT_MODE_ROW) {
3683
3743
  split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
3684
3744
  } else {
3685
- // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
3745
+ // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
3686
3746
  split_buft = llama_default_buffer_type_offload(main_gpu);
3687
3747
  }
3688
3748
  // assign the repeating layers
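Annotation: the tensor-split constants follow the same renaming pattern (LLAMA_SPLIT_LAYER becomes LLAMA_SPLIT_MODE_LAYER, and so on). A minimal sketch of the caller side, assuming llama_model_params keeps its split_mode and main_gpu fields (the model path is a placeholder):

    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_LAYER; // was LLAMA_SPLIT_LAYER in 0.12.x
    mparams.main_gpu   = 0;
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);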
@@ -4595,12 +4655,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
4595
4655
 
4596
4656
  using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
4597
4657
 
4598
- enum llm_rope_type {
4599
- LLM_ROPE,
4600
- LLM_ROPE_NEOX,
4601
- LLM_ROPE_GLM,
4602
- };
4603
-
4604
4658
  enum llm_ffn_op_type {
4605
4659
  LLM_FFN_SILU,
4606
4660
  LLM_FFN_GELU,
@@ -4646,55 +4700,6 @@ static struct ggml_tensor * llm_build_inp_embd(
4646
4700
  return inpL;
4647
4701
  }
4648
4702
 
4649
- // Persimmon: n_rot = n_embd_head_k/2
4650
- // Other: n_rot = n_embd_head_k
4651
- static void llm_build_k_shift(
4652
- struct ggml_context * ctx,
4653
- const llama_hparams & hparams,
4654
- const llama_cparams & cparams,
4655
- const llama_kv_cache & kv,
4656
- struct ggml_cgraph * graph,
4657
- struct ggml_tensor * K_shift,
4658
- llm_rope_type type,
4659
- int64_t n_ctx,
4660
- float freq_base,
4661
- float freq_scale,
4662
- const llm_build_cb & cb) {
4663
- const int64_t n_layer = hparams.n_layer;
4664
- const int64_t n_head_kv = hparams.n_head_kv;
4665
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
4666
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4667
- const int32_t n_rot = hparams.n_rot;
4668
- const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
4669
- const float ext_factor = cparams.yarn_ext_factor;
4670
- const float attn_factor = cparams.yarn_attn_factor;
4671
- const float beta_fast = cparams.yarn_beta_fast;
4672
- const float beta_slow = cparams.yarn_beta_slow;
4673
-
4674
- int rope_type = 0;
4675
-
4676
- switch (type) {
4677
- case LLM_ROPE: rope_type = 0; break;
4678
- case LLM_ROPE_NEOX: rope_type = 2; break;
4679
- case LLM_ROPE_GLM: rope_type = 4; break;
4680
- }
4681
-
4682
- for (int il = 0; il < n_layer; ++il) {
4683
- struct ggml_tensor * tmp =
4684
- // we rotate only the first n_rot dimensions
4685
- ggml_rope_custom_inplace(ctx,
4686
- ggml_view_3d(ctx, kv.k_l[il],
4687
- n_embd_head_k, n_head_kv, n_ctx,
4688
- ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
4689
- ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
4690
- 0),
4691
- K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
4692
- ext_factor, attn_factor, beta_fast, beta_slow);
4693
- cb(tmp, "K_shifted", il);
4694
- ggml_build_forward_expand(graph, tmp);
4695
- }
4696
- }
4697
-
4698
4703
  static void llm_build_kv_store(
4699
4704
  struct ggml_context * ctx,
4700
4705
  const llama_hparams & hparams,
@@ -4896,8 +4901,8 @@ static struct ggml_tensor * llm_build_kqv(
4896
4901
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4897
4902
  }
4898
4903
 
4899
- #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
4900
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
4904
+ #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
4905
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
4901
4906
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
4902
4907
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
4903
4908
  if (hparams.f_max_alibi_bias > 0.0f) {
@@ -4998,6 +5003,7 @@ struct llm_build_context {
4998
5003
 
4999
5004
  const int64_t n_embd;
5000
5005
  const int64_t n_layer;
5006
+ const int64_t n_rot;
5001
5007
  const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
5002
5008
  const int64_t n_head;
5003
5009
  const int64_t n_head_kv;
@@ -5022,8 +5028,8 @@ struct llm_build_context {
5022
5028
  const int32_t kv_head; // index of where we store new KV data in the cache
5023
5029
  const int32_t n_orig_ctx;
5024
5030
 
5025
- const bool do_rope_shift;
5026
- const uint32_t pooling_type;
5031
+ const enum llama_pooling_type pooling_type;
5032
+ const enum llama_rope_type rope_type;
5027
5033
 
5028
5034
  const llm_build_cb & cb;
5029
5035
 
@@ -5045,6 +5051,7 @@ struct llm_build_context {
5045
5051
  kv_self (lctx.kv_self),
5046
5052
  n_embd (hparams.n_embd),
5047
5053
  n_layer (hparams.n_layer),
5054
+ n_rot (hparams.n_rot),
5048
5055
  n_ctx (cparams.n_ctx),
5049
5056
  n_head (hparams.n_head),
5050
5057
  n_head_kv (hparams.n_head_kv),
@@ -5066,8 +5073,8 @@ struct llm_build_context {
5066
5073
  n_kv (worst_case ? n_ctx : kv_self.n),
5067
5074
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
5068
5075
  n_orig_ctx (cparams.n_yarn_orig_ctx),
5069
- do_rope_shift (worst_case || kv_self.has_shift),
5070
- pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
5076
+ pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
5077
+ rope_type (hparams.rope_type),
5071
5078
  cb (cb),
5072
5079
  buf_compute_meta (lctx.buf_compute_meta) {
5073
5080
  // all initializations should be done in init()
@@ -5090,6 +5097,76 @@ struct llm_build_context {
5090
5097
  }
5091
5098
  }
5092
5099
 
5100
+ struct ggml_cgraph * build_k_shift() {
5101
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5102
+
5103
+ for (int il = 0; il < n_layer; ++il) {
5104
+ struct ggml_tensor * tmp =
5105
+ // we rotate only the first n_rot dimensions
5106
+ ggml_rope_custom_inplace(ctx0,
5107
+ ggml_view_3d(ctx0, kv_self.k_l[il],
5108
+ n_embd_head_k, n_head_kv, n_ctx,
5109
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
5110
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
5111
+ 0),
5112
+ lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5113
+ ext_factor, attn_factor, beta_fast, beta_slow);
5114
+ cb(tmp, "K_shifted", il);
5115
+ ggml_build_forward_expand(gf, tmp);
5116
+ }
5117
+
5118
+ return gf;
5119
+ }
5120
+
5121
+ struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
5122
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5123
+
5124
+ for (uint32_t i = 0; i < ids.size(); ++i) {
5125
+ const uint32_t id = ids[i];
5126
+
5127
+ if (i == id || id == ids.size()) {
5128
+ continue;
5129
+ }
5130
+
5131
+ uint32_t nm = 1;
5132
+
5133
+ while (i + nm < ids.size() && ids[i + nm] == id + nm) {
5134
+ nm++;
5135
+ }
5136
+
5137
+ for (int il = 0; il < n_layer; ++il) {
5138
+ ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
5139
+ n_embd_k_gqa, nm,
5140
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
5141
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
5142
+
5143
+ ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
5144
+ n_embd_k_gqa, nm,
5145
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
5146
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
5147
+
5148
+ ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
5149
+ nm, n_embd_v_gqa,
5150
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
5151
+ ggml_row_size(kv_self.v_l[il]->type, i));
5152
+
5153
+ ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
5154
+ nm, n_embd_v_gqa,
5155
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
5156
+ ggml_row_size(kv_self.v_l[il]->type, id));
5157
+
5158
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
5159
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
5160
+ }
5161
+
5162
+ i += nm - 1;
5163
+ }
5164
+
5165
+ //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
5166
+
5167
+ return gf;
5168
+ }
5169
+
5093
5170
  struct ggml_cgraph * build_llama() {
5094
5171
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5095
5172
 
@@ -5111,11 +5188,6 @@ struct llm_build_context {
5111
5188
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5112
5189
  cb(KQ_mask, "KQ_mask", -1);
5113
5190
 
5114
- // shift the entire K-cache if needed
5115
- if (do_rope_shift) {
5116
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5117
- }
5118
-
5119
5191
  for (int il = 0; il < n_layer; ++il) {
5120
5192
  struct ggml_tensor * inpSA = inpL;
5121
5193
 
@@ -5151,14 +5223,14 @@ struct llm_build_context {
5151
5223
 
5152
5224
  Qcur = ggml_rope_custom(
5153
5225
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5154
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5226
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5155
5227
  ext_factor, attn_factor, beta_fast, beta_slow
5156
5228
  );
5157
5229
  cb(Qcur, "Qcur", il);
5158
5230
 
5159
5231
  Kcur = ggml_rope_custom(
5160
5232
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5161
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5233
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5162
5234
  ext_factor, attn_factor, beta_fast, beta_slow
5163
5235
  );
5164
5236
  cb(Kcur, "Kcur", il);
@@ -5299,11 +5371,6 @@ struct llm_build_context {
5299
5371
  struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5300
5372
  cb(KQ_pos, "KQ_pos", -1);
5301
5373
 
5302
- // shift the entire K-cache if needed
5303
- if (do_rope_shift) {
5304
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5305
- }
5306
-
5307
5374
  for (int il = 0; il < n_layer; ++il) {
5308
5375
  struct ggml_tensor * inpSA = inpL;
5309
5376
 
@@ -5327,12 +5394,12 @@ struct llm_build_context {
5327
5394
  case MODEL_7B:
5328
5395
  Qcur = ggml_rope_custom(
5329
5396
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5330
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5397
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5331
5398
  ext_factor, attn_factor, beta_fast, beta_slow
5332
5399
  );
5333
5400
  Kcur = ggml_rope_custom(
5334
5401
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5335
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5402
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5336
5403
  ext_factor, attn_factor, beta_fast, beta_slow
5337
5404
  );
5338
5405
  break;
@@ -5417,11 +5484,6 @@ struct llm_build_context {
5417
5484
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5418
5485
  cb(KQ_mask, "KQ_mask", -1);
5419
5486
 
5420
- // shift the entire K-cache if needed
5421
- if (do_rope_shift) {
5422
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5423
- }
5424
-
5425
5487
  for (int il = 0; il < n_layer; ++il) {
5426
5488
  struct ggml_tensor * attn_norm;
5427
5489
 
@@ -5460,13 +5522,13 @@ struct llm_build_context {
5460
5522
 
5461
5523
  // using mode = 2 for neox mode
5462
5524
  Qcur = ggml_rope_custom(
5463
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5525
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5464
5526
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5465
5527
  );
5466
5528
  cb(Qcur, "Qcur", il);
5467
5529
 
5468
5530
  Kcur = ggml_rope_custom(
5469
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5531
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5470
5532
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5471
5533
  );
5472
5534
  cb(Kcur, "Kcur", il);
@@ -5636,10 +5698,6 @@ struct llm_build_context {
5636
5698
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5637
5699
  cb(KQ_mask, "KQ_mask", -1);
5638
5700
 
5639
- if (do_rope_shift) {
5640
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5641
- }
5642
-
5643
5701
  for (int il = 0; il < n_layer; ++il) {
5644
5702
  struct ggml_tensor * residual = inpL;
5645
5703
 
@@ -5697,7 +5755,7 @@ struct llm_build_context {
5697
5755
 
5698
5756
  // RoPE the first n_rot of q/k, pass the other half, and concat.
5699
5757
  struct ggml_tensor * qrot = ggml_view_3d(
5700
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5758
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5701
5759
  ggml_element_size(tmpq) * n_embd_head,
5702
5760
  ggml_element_size(tmpq) * n_embd_head * n_head,
5703
5761
  0
@@ -5705,7 +5763,7 @@ struct llm_build_context {
5705
5763
  cb(qrot, "qrot", il);
5706
5764
 
5707
5765
  struct ggml_tensor * krot = ggml_view_3d(
5708
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5766
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5709
5767
  ggml_element_size(tmpk) * n_embd_head,
5710
5768
  ggml_element_size(tmpk) * n_embd_head * n_head,
5711
5769
  0
@@ -5714,29 +5772,29 @@ struct llm_build_context {
5714
5772
 
5715
5773
  // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
5716
5774
  struct ggml_tensor * qpass = ggml_view_3d(
5717
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5775
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5718
5776
  ggml_element_size(tmpq) * n_embd_head,
5719
5777
  ggml_element_size(tmpq) * n_embd_head * n_head,
5720
- ggml_element_size(tmpq) * hparams.n_rot
5778
+ ggml_element_size(tmpq) * n_rot
5721
5779
  );
5722
5780
  cb(qpass, "qpass", il);
5723
5781
 
5724
5782
  struct ggml_tensor * kpass = ggml_view_3d(
5725
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5783
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5726
5784
  ggml_element_size(tmpk) * n_embd_head,
5727
5785
  ggml_element_size(tmpk) * n_embd_head * n_head,
5728
- ggml_element_size(tmpk) * hparams.n_rot
5786
+ ggml_element_size(tmpk) * n_rot
5729
5787
  );
5730
5788
  cb(kpass, "kpass", il);
5731
5789
 
5732
5790
  struct ggml_tensor * qrotated = ggml_rope_custom(
5733
- ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5791
+ ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5734
5792
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5735
5793
  );
5736
5794
  cb(qrotated, "qrotated", il);
5737
5795
 
5738
5796
  struct ggml_tensor * krotated = ggml_rope_custom(
5739
- ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5797
+ ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5740
5798
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5741
5799
  );
5742
5800
  cb(krotated, "krotated", il);
@@ -5988,14 +6046,14 @@ struct llm_build_context {
5988
6046
 
5989
6047
  Qcur = ggml_rope_custom(
5990
6048
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5991
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6049
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5992
6050
  ext_factor, attn_factor, beta_fast, beta_slow
5993
6051
  );
5994
6052
  cb(Qcur, "Qcur", il);
5995
6053
 
5996
6054
  Kcur = ggml_rope_custom(
5997
6055
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5998
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6056
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5999
6057
  ext_factor, attn_factor, beta_fast, beta_slow
6000
6058
  );
6001
6059
  cb(Kcur, "Kcur", il);
@@ -6047,12 +6105,12 @@ struct llm_build_context {
6047
6105
  cur = inpL;
6048
6106
 
6049
6107
  // pooling layer
6050
- if (pooling_type == LLAMA_POOLING_MEAN) {
6108
+ if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
6051
6109
  cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6052
- } else if (pooling_type == LLAMA_POOLING_CLS) {
6110
+ } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
6053
6111
  cur = ggml_get_rows(ctx0, cur, inp_cls);
6054
6112
  } else {
6055
- GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
6113
+ GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
6056
6114
  }
6057
6115
  cb(cur, "result_embd", -1);
6058
6116
 
@@ -6284,11 +6342,6 @@ struct llm_build_context {
6284
6342
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6285
6343
  cb(KQ_mask, "KQ_mask", -1);
6286
6344
 
6287
- // shift the entire K-cache if needed
6288
- if (do_rope_shift) {
6289
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6290
- }
6291
-
6292
6345
  for (int il = 0; il < n_layer; ++il) {
6293
6346
  struct ggml_tensor * inpSA = inpL;
6294
6347
 
@@ -6325,14 +6378,14 @@ struct llm_build_context {
6325
6378
 
6326
6379
  Qcur = ggml_rope_custom(
6327
6380
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6328
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6381
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6329
6382
  ext_factor, attn_factor, beta_fast, beta_slow
6330
6383
  );
6331
6384
  cb(Qcur, "Qcur", il);
6332
6385
 
6333
6386
  Kcur = ggml_rope_custom(
6334
6387
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6335
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6388
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6336
6389
  ext_factor, attn_factor, beta_fast, beta_slow
6337
6390
  );
6338
6391
  cb(Kcur, "Kcur", il);
@@ -6407,11 +6460,6 @@ struct llm_build_context {
6407
6460
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6408
6461
  cb(KQ_mask, "KQ_mask", -1);
6409
6462
 
6410
- // shift the entire K-cache if needed
6411
- if (do_rope_shift) {
6412
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6413
- }
6414
-
6415
6463
  for (int il = 0; il < n_layer; ++il) {
6416
6464
  struct ggml_tensor * inpSA = inpL;
6417
6465
 
@@ -6441,13 +6489,13 @@ struct llm_build_context {
6441
6489
 
6442
6490
  // using mode = 2 for neox mode
6443
6491
  Qcur = ggml_rope_custom(
6444
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6492
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6445
6493
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6446
6494
  );
6447
6495
  cb(Qcur, "Qcur", il);
6448
6496
 
6449
6497
  Kcur = ggml_rope_custom(
6450
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6498
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6451
6499
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6452
6500
  );
6453
6501
  cb(Kcur, "Kcur", il);
@@ -6521,11 +6569,6 @@ struct llm_build_context {
6521
6569
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6522
6570
  cb(KQ_mask, "KQ_mask", -1);
6523
6571
 
6524
- // shift the entire K-cache if needed
6525
- if (do_rope_shift) {
6526
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6527
- }
6528
-
6529
6572
  for (int il = 0; il < n_layer; ++il) {
6530
6573
  struct ggml_tensor * inpSA = inpL;
6531
6574
 
@@ -6561,14 +6604,14 @@ struct llm_build_context {
6561
6604
 
6562
6605
  Qcur = ggml_rope_custom(
6563
6606
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6564
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6607
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6565
6608
  ext_factor, attn_factor, beta_fast, beta_slow
6566
6609
  );
6567
6610
  cb(Qcur, "Qcur", il);
6568
6611
 
6569
6612
  Kcur = ggml_rope_custom(
6570
6613
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6571
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6614
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6572
6615
  ext_factor, attn_factor, beta_fast, beta_slow
6573
6616
  );
6574
6617
  cb(Kcur, "Kcur", il);
@@ -6642,11 +6685,6 @@ struct llm_build_context {
6642
6685
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6643
6686
  cb(KQ_mask, "KQ_mask", -1);
6644
6687
 
6645
- // shift the entire K-cache if needed
6646
- if (do_rope_shift) {
6647
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6648
- }
6649
-
6650
6688
  for (int il = 0; il < n_layer; ++il) {
6651
6689
  attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
6652
6690
  model.layers[il].attn_norm,
@@ -6684,7 +6722,7 @@ struct llm_build_context {
6684
6722
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6685
6723
 
6686
6724
  Qcur = ggml_rope_custom(
6687
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6725
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6688
6726
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6689
6727
  );
6690
6728
  cb(Qcur, "Qcur", il);
@@ -6695,7 +6733,7 @@ struct llm_build_context {
6695
6733
  cb(Qcur, "Qcur", il);
6696
6734
 
6697
6735
  Kcur = ggml_rope_custom(
6698
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6736
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6699
6737
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6700
6738
  );
6701
6739
  cb(Kcur, "Kcur", il);
@@ -6764,11 +6802,6 @@ struct llm_build_context {
6764
6802
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6765
6803
  cb(KQ_mask, "KQ_mask", -1);
6766
6804
 
6767
- // shift the entire K-cache if needed
6768
- if (do_rope_shift) {
6769
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6770
- }
6771
-
6772
6805
  for (int il = 0; il < n_layer; ++il) {
6773
6806
 
6774
6807
  // norm
@@ -6792,14 +6825,14 @@ struct llm_build_context {
6792
6825
  cb(Vcur, "Vcur", il);
6793
6826
 
6794
6827
  Qcur = ggml_rope_custom(
6795
- ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
6796
- n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
6828
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
6829
+ n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6797
6830
  ext_factor, attn_factor, beta_fast, beta_slow);
6798
6831
  cb(Qcur, "Qcur", il);
6799
6832
 
6800
6833
  Kcur = ggml_rope_custom(
6801
- ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
6802
- n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
6834
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
6835
+ n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6803
6836
  ext_factor, attn_factor, beta_fast, beta_slow);
6804
6837
  cb(Kcur, "Kcur", il);
6805
6838
 
@@ -6969,11 +7002,6 @@ struct llm_build_context {
6969
7002
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6970
7003
  cb(KQ_mask, "KQ_mask", -1);
6971
7004
 
6972
- // shift the entire K-cache if needed
6973
- if (do_rope_shift) {
6974
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6975
- }
6976
-
6977
7005
  for (int il = 0; il < n_layer; ++il) {
6978
7006
  cur = llm_build_norm(ctx0, inpL, hparams,
6979
7007
  model.layers[il].attn_norm,
@@ -6999,14 +7027,14 @@ struct llm_build_context {
6999
7027
 
7000
7028
  struct ggml_tensor * Qcur = ggml_rope_custom(
7001
7029
  ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
7002
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7030
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7003
7031
  ext_factor, attn_factor, beta_fast, beta_slow
7004
7032
  );
7005
7033
  cb(Qcur, "Qcur", il);
7006
7034
 
7007
7035
  struct ggml_tensor * Kcur = ggml_rope_custom(
7008
7036
  ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
7009
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7037
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7010
7038
  ext_factor, attn_factor, beta_fast, beta_slow
7011
7039
  );
7012
7040
  cb(Kcur, "Kcur", il);
@@ -7077,11 +7105,6 @@ struct llm_build_context {
7077
7105
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7078
7106
  cb(KQ_mask, "KQ_mask", -1);
7079
7107
 
7080
- // shift the entire K-cache if needed
7081
- if (do_rope_shift) {
7082
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7083
- }
7084
-
7085
7108
  for (int il = 0; il < n_layer; ++il) {
7086
7109
  struct ggml_tensor * inpSA = inpL;
7087
7110
 
@@ -7117,14 +7140,14 @@ struct llm_build_context {
7117
7140
 
7118
7141
  Qcur = ggml_rope_custom(
7119
7142
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7120
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7143
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7121
7144
  ext_factor, attn_factor, beta_fast, beta_slow
7122
7145
  );
7123
7146
  cb(Qcur, "Qcur", il);
7124
7147
 
7125
7148
  Kcur = ggml_rope_custom(
7126
7149
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7127
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7150
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7128
7151
  ext_factor, attn_factor, beta_fast, beta_slow
7129
7152
  );
7130
7153
  cb(Kcur, "Kcur", il);
@@ -7196,11 +7219,6 @@ struct llm_build_context {
7196
7219
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7197
7220
  cb(KQ_mask, "KQ_mask", -1);
7198
7221
 
7199
- // shift the entire K-cache if needed
7200
- if (do_rope_shift) {
7201
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7202
- }
7203
-
7204
7222
  for (int il = 0; il < n_layer; ++il) {
7205
7223
  struct ggml_tensor * inpSA = inpL;
7206
7224
 
@@ -7236,14 +7254,14 @@ struct llm_build_context {
7236
7254
 
7237
7255
  Qcur = ggml_rope_custom(
7238
7256
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7239
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7257
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7240
7258
  ext_factor, attn_factor, beta_fast, beta_slow
7241
7259
  );
7242
7260
  cb(Qcur, "Qcur", il);
7243
7261
 
7244
7262
  Kcur = ggml_rope_custom(
7245
7263
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7246
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7264
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7247
7265
  ext_factor, attn_factor, beta_fast, beta_slow
7248
7266
  );
7249
7267
  cb(Kcur, "Kcur", il);
@@ -7328,11 +7346,6 @@ struct llm_build_context {
7328
7346
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7329
7347
  cb(KQ_mask, "KQ_mask", -1);
7330
7348
 
7331
- // shift the entire K-cache if needed
7332
- if (do_rope_shift) {
7333
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7334
- }
7335
-
7336
7349
  for (int il = 0; il < n_layer; ++il) {
7337
7350
  struct ggml_tensor * inpSA = inpL;
7338
7351
 
@@ -7368,14 +7381,14 @@ struct llm_build_context {
7368
7381
 
7369
7382
  Qcur = ggml_rope_custom(
7370
7383
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7371
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7384
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7372
7385
  ext_factor, attn_factor, beta_fast, beta_slow
7373
7386
  );
7374
7387
  cb(Qcur, "Qcur", il);
7375
7388
 
7376
7389
  Kcur = ggml_rope_custom(
7377
7390
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7378
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7391
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7379
7392
  ext_factor, attn_factor, beta_fast, beta_slow
7380
7393
  );
7381
7394
  cb(Kcur, "Kcur", il);
@@ -7464,11 +7477,6 @@ struct llm_build_context {
7464
7477
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7465
7478
  cb(KQ_mask, "KQ_mask", -1);
7466
7479
 
7467
- // shift the entire K-cache if needed
7468
- if (do_rope_shift) {
7469
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7470
- }
7471
-
7472
7480
  for (int il = 0; il < n_layer; ++il) {
7473
7481
 
7474
7482
  // norm
@@ -7491,7 +7499,7 @@ struct llm_build_context {
7491
7499
 
7492
7500
  Qcur = ggml_rope_custom(
7493
7501
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
7494
- n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
7502
+ n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7495
7503
  ext_factor, attn_factor, beta_fast, beta_slow);
7496
7504
  cb(Qcur, "Qcur", il);
7497
7505
 
@@ -7500,7 +7508,7 @@ struct llm_build_context {
7500
7508
 
7501
7509
  Kcur = ggml_rope_custom(
7502
7510
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
7503
- n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
7511
+ n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7504
7512
  ext_factor, attn_factor, beta_fast, beta_slow);
7505
7513
  cb(Kcur, "Kcur", il);
7506
7514
 
@@ -7553,6 +7561,40 @@ struct llm_build_context {
7553
7561
  }
7554
7562
  };
7555
7563
 
7564
+ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
7565
+ llama_batch dummy;
7566
+ dummy.n_tokens = 0;
7567
+
7568
+ llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
7569
+
7570
+ struct llm_build_context llm(lctx, dummy, cb, false);
7571
+
7572
+ llm.init();
7573
+
7574
+ struct ggml_cgraph * result = llm.build_defrag(ids);
7575
+
7576
+ llm.free();
7577
+
7578
+ return result;
7579
+ }
7580
+
7581
+ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
7582
+ llama_batch dummy;
7583
+ dummy.n_tokens = 0;
7584
+
7585
+ llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
7586
+
7587
+ struct llm_build_context llm(lctx, dummy, cb, false);
7588
+
7589
+ llm.init();
7590
+
7591
+ struct ggml_cgraph * result = llm.build_k_shift();
7592
+
7593
+ llm.free();
7594
+
7595
+ return result;
7596
+ }
7597
+
7556
7598
  static struct ggml_cgraph * llama_build_graph(
7557
7599
  llama_context & lctx,
7558
7600
  const llama_batch & batch,
@@ -7672,6 +7714,20 @@ static struct ggml_cgraph * llama_build_graph(
7672
7714
  return result;
7673
7715
  }
7674
7716
 
7717
+ static void llama_set_k_shift(llama_context & lctx) {
7718
+ const auto & cparams = lctx.cparams;
7719
+
7720
+ const int64_t n_ctx = cparams.n_ctx;
7721
+
7722
+ assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7723
+
7724
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7725
+
7726
+ for (int i = 0; i < n_ctx; ++i) {
7727
+ data[i] = lctx.kv_self.cells[i].delta;
7728
+ }
7729
+ }
7730
+
7675
7731
  static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7676
7732
  //
7677
7733
  // set input data
@@ -7739,19 +7795,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7739
7795
  }
7740
7796
  }
7741
7797
 
7742
- if (kv_self.has_shift) {
7743
- const int64_t n_ctx = cparams.n_ctx;
7744
-
7745
- assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7746
-
7747
- int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7748
-
7749
- for (int i = 0; i < n_ctx; ++i) {
7750
- data[i] = lctx.kv_self.cells[i].delta;
7751
- }
7752
- }
7753
-
7754
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
7798
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
7755
7799
  const int64_t n_tokens = batch.n_tokens;
7756
7800
 
7757
7801
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
@@ -7779,7 +7823,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7779
7823
  }
7780
7824
  }
7781
7825
 
7782
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
7826
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
7783
7827
  const int64_t n_tokens = batch.n_tokens;
7784
7828
 
7785
7829
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
@@ -7795,6 +7839,34 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7795
7839
  }
7796
7840
  }
7797
7841
 
7842
+ static void llama_graph_compute(
7843
+ llama_context & lctx,
7844
+ ggml_cgraph * gf,
7845
+ int n_threads) {
7846
+ #ifdef GGML_USE_MPI
7847
+ const int64_t n_layer = lctx.model.hparams.n_layer;
7848
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
7849
+ #endif
7850
+
7851
+ #ifdef GGML_USE_METAL
7852
+ if (ggml_backend_is_metal(lctx.backend_metal)) {
7853
+ ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
7854
+ }
7855
+ #endif
7856
+
7857
+ if (lctx.backend_cpu != nullptr) {
7858
+ ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
7859
+ }
7860
+
7861
+ ggml_backend_sched_graph_compute(lctx.sched, gf);
7862
+
7863
+ // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
7864
+
7865
+ #ifdef GGML_USE_MPI
7866
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
7867
+ #endif
7868
+ }
7869
+
7798
7870
  // decode a batch of tokens by evaluating the transformer
7799
7871
  //
7800
7872
  // - lctx: llama context
@@ -7821,9 +7893,9 @@ static int llama_decode_internal(
7821
7893
  const auto n_batch = cparams.n_batch;
7822
7894
 
7823
7895
  GGML_ASSERT(n_tokens <= n_batch);
7896
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
7824
7897
 
7825
7898
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
7826
- GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
7827
7899
 
7828
7900
  const int64_t t_start_us = ggml_time_us();
7829
7901
 
@@ -7872,6 +7944,8 @@ static int llama_decode_internal(
7872
7944
  batch.seq_id = seq_id_arr.data();
7873
7945
  }
7874
7946
 
7947
+ llama_kv_cache_update(&lctx);
7948
+
7875
7949
  // if we have enough unused cells before the current head ->
7876
7950
  // better to start searching from the beginning of the cache, hoping to fill it
7877
7951
  if (kv_self.head > kv_self.used + 2*n_tokens) {
@@ -7896,8 +7970,9 @@ static int llama_decode_internal(
7896
7970
  ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
7897
7971
 
7898
7972
  // the output is always the last tensor in the graph
7899
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7973
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7900
7974
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7975
+
7901
7976
  if (strcmp(res->name, "result_output") == 0) {
7902
7977
  // the embeddings could be the second to last tensor, or the third to last tensor
7903
7978
  if (strcmp(embeddings->name, "result_norm") != 0) {
@@ -7924,40 +7999,12 @@ static int llama_decode_internal(
7924
7999
  n_threads = std::min(4, n_threads);
7925
8000
  }
7926
8001
 
7927
- #ifdef GGML_USE_MPI
7928
- const int64_t n_layer = hparams.n_layer;
7929
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
7930
- #endif
7931
-
7932
- #ifdef GGML_USE_METAL
7933
- if (ggml_backend_is_metal(lctx.backend_metal)) {
7934
- ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
7935
- }
7936
- #endif
7937
-
7938
- if (lctx.backend_cpu != nullptr) {
7939
- ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
7940
- }
7941
-
7942
8002
  llama_set_inputs(lctx, batch);
7943
8003
 
7944
- ggml_backend_sched_graph_compute(lctx.sched, gf);
7945
-
7946
- // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
7947
-
7948
- #ifdef GGML_USE_MPI
7949
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
7950
- #endif
8004
+ llama_graph_compute(lctx, gf, n_threads);
7951
8005
 
7952
8006
  // update the kv ring buffer
7953
8007
  {
7954
- if (kv_self.has_shift) {
7955
- kv_self.has_shift = false;
7956
- for (uint32_t i = 0; i < kv_self.size; ++i) {
7957
- kv_self.cells[i].delta = 0;
7958
- }
7959
- }
7960
-
7961
8008
  kv_self.head += n_tokens;
7962
8009
 
7963
8010
  // Ensure kv cache head points to a valid index.
@@ -7966,6 +8013,18 @@ static int llama_decode_internal(
7966
8013
  }
7967
8014
  }
7968
8015
 
8016
+ // decide if we need to defrag the kv cache
8017
+ if (cparams.defrag_thold >= 0.0f) {
8018
+ const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
8019
+
8020
+ // queue defragmentation for next llama_kv_cache_update
8021
+ if (fragmentation > cparams.defrag_thold) {
8022
+ //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
8023
+
8024
+ llama_kv_cache_defrag(kv_self);
8025
+ }
8026
+ }
8027
+
7969
8028
  #ifdef GGML_PERF
7970
8029
  // print timing information per ggml operation (for debugging purposes)
7971
8030
  // requires GGML_PERF to be defined
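As a rough worked example of the defrag trigger added above (the numbers are invented): with a KV window of n = 512 cells and used + n_tokens = 300 occupied cells, the estimate is 1 - 300/512 ≈ 0.41, so any defrag_thold at or below ~0.4 queues a defrag for the next llama_kv_cache_update. A minimal standalone sketch of the same heuristic, using a made-up helper name:

    // mirrors the heuristic above; kv_fragmentation is a hypothetical name for illustration
    static float kv_fragmentation(uint32_t n_window, uint32_t n_occupied) {
        // only meaningful once the window is reasonably large (>= 128 cells)
        return n_window >= 128 ? 1.0f - float(n_occupied)/float(n_window) : 0.0f;
    }
    // kv_fragmentation(512, 300) ~= 0.414f  ->  exceeds a defrag_thold of, say, 0.1f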
@@ -8053,6 +8112,245 @@ static int llama_decode_internal(
8053
8112
  return 0;
8054
8113
  }
8055
8114
 
8115
+ // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
8116
+ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8117
+ auto & kv_self = lctx.kv_self;
8118
+
8119
+ const auto & hparams = lctx.model.hparams;
8120
+
8121
+ const uint32_t n_layer = hparams.n_layer;
8122
+
8123
+ const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
8124
+ const uint32_t n_used = kv_self.used;
8125
+
8126
+ assert(n_used <= n_kv);
8127
+
8128
+ //const int64_t t_start = ggml_time_us();
8129
+
8130
+ // number of cells moved
8131
+ uint32_t n_moves = 0;
8132
+
8133
+ // determine which KV cells to move where
8134
+ //
8135
+ // cell i moves to ids[i]
8136
+ //
8137
+ // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
8138
+ //
8139
+ std::vector<uint32_t> ids(n_kv, n_kv);
8140
+
8141
+ for (uint32_t i0 = 0; i0 < n_used; ++i0) {
8142
+ const auto & cell0 = kv_self.cells[i0];
8143
+
8144
+ if (!cell0.is_empty()) {
8145
+ ids[i0] = i0;
8146
+
8147
+ continue;
8148
+ }
8149
+
8150
+ // found a hole - fill it with data from the end of the cache
8151
+
8152
+ uint32_t nh = 1;
8153
+
8154
+ // determine the size of the hole
8155
+ while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
8156
+ nh++;
8157
+ }
8158
+
8159
+ // each move requires 6*n_layer tensors (see build_defrag)
8160
+ // - source view, destination view, copy operation
8161
+ // - x2 for keys and values
8162
+ //
8163
+ if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
8164
+ // the graph is too big, we cannot move more cells
8165
+ break;
8166
+ }
8167
+
8168
+ uint32_t nf = 0;
8169
+ uint32_t is = n_kv - 1;
8170
+
8171
+ // starting from the end, find nh non-empty cells
8172
+ for (; is > i0; --is) {
8173
+ const auto & cell1 = kv_self.cells[is];
8174
+
8175
+ if (cell1.is_empty() || ids[is] != n_kv) {
8176
+ continue;
8177
+ }
8178
+
8179
+ // non-empty cell which is not yet moved
8180
+ nf++;
8181
+
8182
+ if (nf == nh) {
8183
+ break;
8184
+ }
8185
+ }
8186
+
8187
+ // this can only happen if `n_used` is not accurate, which would be a bug
8188
+ GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
8189
+
8190
+ nf = 0;
8191
+
8192
+ uint32_t i1 = is;
8193
+
8194
+ // are we moving a contiguous block of memory?
8195
+ bool cont = false;
8196
+
8197
+ // go back and move the nf cells to the hole
8198
+ for (; i1 < n_kv; ++i1) {
8199
+ auto & cell1 = kv_self.cells[i1];
8200
+
8201
+ if (cell1.is_empty() || ids[i1] != n_kv) {
8202
+ cont = false;
8203
+ continue;
8204
+ }
8205
+
8206
+ // this cell goes to (i0 + nf)
8207
+ ids[i1] = i0 + nf;
8208
+
8209
+ // move the cell meta data
8210
+ kv_self.cells[i0 + nf] = cell1;
8211
+
8212
+ // clear the old cell and move the head there
8213
+ cell1 = llama_kv_cell();
8214
+ kv_self.head = n_used;
8215
+
8216
+ if (!cont) {
8217
+ n_moves++;
8218
+ cont = true;
8219
+ }
8220
+
8221
+ nf++;
8222
+
8223
+ if (nf == nh) {
8224
+ break;
8225
+ }
8226
+ }
8227
+
8228
+ //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
8229
+
8230
+ i0 += nh - 1;
8231
+ }
8232
+
8233
+ if (n_moves == 0) {
8234
+ return;
8235
+ }
8236
+
8237
+ //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
8238
+
8239
+ //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
8240
+
8241
+ #if 0
8242
+ // CPU defrag
8243
+ //
8244
+ // TODO: optimizations are possible:
8245
+ // - multiple threads
8246
+ // - avoid copying to the host memory when already there
8247
+ //
8248
+ // likely not worth the effort, as we have ggml_graph based defrag
8249
+ //
8250
+
8251
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
8252
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
8253
+
8254
+ const uint32_t kv_size = kv_self.size;
8255
+
8256
+ std::vector<uint8_t> buf_k;
8257
+ std::vector<uint8_t> buf_v;
8258
+
8259
+ for (uint32_t il = 0; il < n_layer; ++il) {
8260
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
8261
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
8262
+
8263
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
8264
+ const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
8265
+
8266
+ buf_k.resize(k_size);
8267
+ buf_v.resize(v_size);
8268
+
8269
+ ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
8270
+ ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
8271
+
8272
+ // batch move [i, i+nm) to [id, id+nm)
8273
+ // note: cells can move only to a lower index
8274
+ for (uint32_t i = 0; i < n_kv; ++i) {
8275
+ const uint32_t id = ids[i];
8276
+
8277
+ if (i == id || id == n_kv) {
8278
+ continue;
8279
+ }
8280
+
8281
+ uint32_t nm = 1;
8282
+
8283
+ while (i + nm < n_kv && ids[i + nm] == id + nm) {
8284
+ nm++;
8285
+ }
8286
+
8287
+ // move keys
8288
+ {
8289
+ const int64_t os = i*k_size_row;
8290
+ const int64_t od = id*k_size_row;
8291
+
8292
+ memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
8293
+ }
8294
+
8295
+ // move values (note: they are transposed)
8296
+ {
8297
+ const int64_t os = i;
8298
+ const int64_t od = id;
8299
+
8300
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
8301
+ memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
8302
+ }
8303
+ }
8304
+
8305
+ i += nm - 1;
8306
+ }
8307
+
8308
+ ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
8309
+ ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
8310
+ }
8311
+ #else
8312
+ // ggml_graph defrag
8313
+
8314
+ ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
8315
+
8316
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
8317
+ #endif
8318
+
8319
+ //const int64_t t_end = ggml_time_us();
8320
+
8321
+ //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
8322
+ }
8323
+
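To make the bookkeeping above concrete, here is a hedged toy trace of the ids mapping (the cell layout is invented): with n_kv = 8, cells 2-3 empty and cells 6-7 holding the tail of the data, the scan fills the hole from the end of the cache and records a single contiguous move. If LLAMA_MAX_NODES is 8192 (its usual value), the 6*n_layer nodes charged per move cap a 32-layer model at roughly 8192 / (6*32) ≈ 42 moved blocks per pass.

    // invented layout: n_kv = 8, cells 2 and 3 are holes, cells 6 and 7 hold the tail
    // before the scan: ids = {8, 8, 8, 8, 8, 8, 8, 8}   (ids[i] == n_kv means "not moved")
    // after the scan : ids = {0, 1, 8, 8, 4, 5, 2, 3}   -> cells 6,7 are copied into the hole at 2,3
    // the two moved cells are adjacent, so this counts as one move (n_moves == 1)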
8324
+ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
8325
+ // apply K-shift if needed
8326
+ if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
8327
+ llama_set_k_shift(lctx);
8328
+
8329
+ {
8330
+ ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
8331
+
8332
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
8333
+ }
8334
+
8335
+ {
8336
+ auto & kv_self = lctx.kv_self;
8337
+
8338
+ kv_self.has_shift = false;
8339
+
8340
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
8341
+ kv_self.cells[i].delta = 0;
8342
+ }
8343
+ }
8344
+ }
8345
+
8346
+ // defragment the KV cache if needed
8347
+ if (lctx.kv_self.do_defrag) {
8348
+ llama_kv_cache_defrag_internal(lctx);
8349
+
8350
+ lctx.kv_self.do_defrag = false;
8351
+ }
8352
+ }
8353
+
8056
8354
  //
8057
8355
  // tokenizer
8058
8356
  //
@@ -8644,37 +8942,46 @@ struct llm_tokenizer_wpm {
8644
8942
  }
8645
8943
 
8646
8944
  std::vector<std::string> preprocess(const std::string & text) {
8647
- std::string ori_str = normalize(text);
8648
- uint64_t ori_size = ori_str.size();
8945
+ // normalization form D
8946
+ std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
8947
+ std::vector<uint32_t> nfd_codepoints;
8948
+ for (uint32_t code : codepoints) {
8949
+ auto it = nfd_map.equal_range(code);
8950
+ if (it.first != it.second) {
8951
+ for (auto jt = it.first; jt != it.second; jt++) {
8952
+ nfd_codepoints.push_back(jt->second);
8953
+ }
8954
+ } else {
8955
+ nfd_codepoints.push_back(code);
8956
+ }
8957
+ }
8649
8958
 
8650
- // single punct / single symbol / single digit
8651
- // baseline: add whitespace on the left and right of punct and chinese characters
8652
- std::vector<std::string> words;
8959
+ // strip accents, strip control, uniformize whitespace,
8960
+ // to lowercase, pad chinese characters, pad punctuation
8653
8961
  std::string new_str = "";
8654
- uint64_t i = 0;
8655
- while (i < ori_size) {
8656
- int utf_char_len = utf8_len(ori_str[i]);
8657
- if ((utf_char_len == 1) && ispunct(ori_str[i])) {
8658
- new_str += " ";
8659
- new_str += ori_str[i];
8660
- new_str += " ";
8661
- i += 1;
8962
+ for (uint32_t code : nfd_codepoints) {
8963
+ int type = codepoint_type(code);
8964
+ if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
8965
+ continue;
8662
8966
  }
8663
- else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
8967
+ code = to_lower(code);
8968
+ if (type == CODEPOINT_TYPE_WHITESPACE) {
8969
+ code = ' ';
8970
+ }
8971
+ std::string s = codepoint_to_utf8(code);
8972
+ if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
8664
8973
  new_str += " ";
8665
- new_str += ori_str.substr(i, 3);
8974
+ new_str += s;
8666
8975
  new_str += " ";
8667
- i += 3;
8668
- }
8669
- else {
8670
- new_str += ori_str[i];
8671
- i += 1;
8976
+ } else {
8977
+ new_str += s;
8672
8978
  }
8673
8979
  }
8674
8980
 
8675
8981
  // split by whitespace
8676
8982
  uint64_t l = 0;
8677
8983
  uint64_t r = 0;
8984
+ std::vector<std::string> words;
8678
8985
  while (r < new_str.size()) {
8679
8986
  // if is whitespace
8680
8987
  if (isspace(new_str[r])) {
@@ -8692,47 +8999,21 @@ struct llm_tokenizer_wpm {
8692
8999
  return words;
8693
9000
  }
8694
9001
 
8695
- std::string normalize(const std::string & text) {
8696
- // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
8697
- std::string text2 = strip_accents(text);
8698
- for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
8699
- char c = text2[i];
8700
- if (c >= 'A' && c <= 'Z') {
8701
- text2[i] = c - 'A' + 'a';
8702
- }
9002
+ uint32_t to_lower(uint32_t code) {
9003
+ static const std::locale locale("en_US.UTF-8");
9004
+ #if defined(_WIN32)
9005
+ if (code > 0xFFFF) {
9006
+ return code;
8703
9007
  }
8704
- return text2;
9008
+ #endif
9009
+ return std::tolower(wchar_t(code), locale);
8705
9010
  }
8706
9011
 
8707
- bool is_chinese_char(const std::string & str) {
8708
- int len = str.length();
8709
- unsigned int codepoint = 0;
8710
- int num_bytes = 0;
8711
- int i = 0;
8712
- unsigned char ch = static_cast<unsigned char>(str[i]);
8713
- if (ch <= 0x7f) {
8714
- codepoint = ch;
8715
- num_bytes = 1;
8716
- } else if ((ch >> 5) == 0x06) {
8717
- codepoint = ch & 0x1f;
8718
- num_bytes = 2;
8719
- } else if ((ch >> 4) == 0x0e) {
8720
- codepoint = ch & 0x0f;
8721
- num_bytes = 3;
8722
- } else if ((ch >> 3) == 0x1e) {
8723
- codepoint = ch & 0x07;
8724
- num_bytes = 4;
8725
- }
8726
- for (int j = 1; j < num_bytes; ++j) {
8727
- if (i + j >= len) {
8728
- return false; // incomplete UTF-8 character
8729
- }
8730
- unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
8731
- if ((next_ch >> 6) != 0x02) {
8732
- return false; // invalid trailing byte
8733
- }
8734
- codepoint = (codepoint << 6) | (next_ch & 0x3f);
8735
- }
9012
+ bool is_ascii_punct(uint32_t code) {
9013
+ return code < 256 && ispunct(code);
9014
+ }
9015
+
9016
+ bool is_chinese_char(uint32_t codepoint) {
8736
9017
  if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
8737
9018
  (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
8738
9019
  (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
@@ -8748,41 +9029,6 @@ struct llm_tokenizer_wpm {
8748
9029
  return false;
8749
9030
  }
8750
9031
 
8751
- std::string strip_accents(const std::string & input_string) {
8752
- std::string resultString;
8753
- std::map<std::string, char> accent_map = {
8754
- {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
8755
- {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
8756
- {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
8757
- {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
8758
- {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
8759
- {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
8760
- {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
8761
- {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
8762
- {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
8763
- };
8764
-
8765
- for (size_t i = 0; i < input_string.length();) {
8766
- int len = utf8_len(input_string[i]);
8767
- std::string curChar = input_string.substr(i, len);
8768
- auto iter = accent_map.find(curChar);
8769
- if (iter != accent_map.end()) {
8770
- resultString += iter->second;
8771
- } else {
8772
- resultString += curChar;
8773
- }
8774
- i += len;
8775
- }
8776
-
8777
- return resultString;
8778
- }
8779
-
8780
- static size_t utf8_len(char src) {
8781
- const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
8782
- uint8_t highbits = static_cast<uint8_t>(src) >> 4;
8783
- return lookup[highbits];
8784
- }
8785
-
8786
9032
  const llama_vocab & vocab;
8787
9033
  };
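A hedged sketch of what the rewritten preprocess now produces (the input string and the intermediate forms are illustrative, not taken from any test): NFD decomposition drops combining accents, letters are lower-cased via the locale-aware to_lower, punctuation and CJK characters are padded with spaces, and the result is split on whitespace.

    // illustrative only -- assumes an already constructed llm_tokenizer_wpm instance `wpm`
    // input                          : "Déjà vu, 世界!"
    // NFD + strip accents + lowercase: "deja vu, 世界!"
    // pad punctuation and CJK        : "deja vu ,  世  界  ! "
    std::vector<std::string> words = wpm.preprocess("Déjà vu, 世界!");
    // words == {"deja", "vu", ",", "世", "界", "!"}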
8788
9034
 
@@ -9816,10 +10062,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
9816
10062
  }
9817
10063
  }
9818
10064
 
9819
- void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
9820
- llama_sample_temp(ctx, candidates_p, temp);
9821
- }
9822
-
9823
10065
  void llama_sample_repetition_penalties(
9824
10066
  struct llama_context * ctx,
9825
10067
  llama_token_data_array * candidates,
@@ -9946,38 +10188,6 @@ void llama_sample_apply_guidance(
9946
10188
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9947
10189
  }
9948
10190
 
9949
- void llama_sample_classifier_free_guidance(
9950
- struct llama_context * ctx,
9951
- llama_token_data_array * candidates,
9952
- struct llama_context * guidance_ctx,
9953
- float scale) {
9954
- GGML_ASSERT(ctx);
9955
- int64_t t_start_sample_us;
9956
-
9957
- t_start_sample_us = ggml_time_us();
9958
- const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
9959
-
9960
- GGML_ASSERT(n_vocab == candidates->size);
9961
- GGML_ASSERT(!candidates->sorted);
9962
-
9963
- std::vector<float> logits_base(n_vocab);
9964
- for (size_t i = 0; i < n_vocab; ++i) {
9965
- logits_base[i] = candidates->data[i].logit;
9966
- }
9967
-
9968
- float * logits_guidance = llama_get_logits(guidance_ctx);
9969
-
9970
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9971
- llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
9972
- t_start_sample_us = ggml_time_us();
9973
-
9974
- for (size_t i = 0; i < n_vocab; ++i) {
9975
- candidates->data[i].logit = logits_base[i];
9976
- }
9977
-
9978
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9979
- }
9980
-
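For callers of the removed classifier-free-guidance sampler, the surviving llama_sample_apply_guidance works directly on logits arrays; a minimal migration sketch (ctx, guidance_ctx and scale are the caller's existing variables):

    // before: llama_sample_classifier_free_guidance(ctx, &candidates, guidance_ctx, scale);
    // after : apply the guidance to the raw logits, then rebuild the candidate array from them
    float * logits          = llama_get_logits(ctx);
    float * logits_guidance = llama_get_logits(guidance_ctx);
    llama_sample_apply_guidance(ctx, logits, logits_guidance, scale);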
9981
10191
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
9982
10192
  GGML_ASSERT(ctx);
9983
10193
 
@@ -10508,31 +10718,47 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10508
10718
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
10509
10719
  new_type = GGML_TYPE_Q8_0;
10510
10720
  }
10511
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10721
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
10722
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10512
10723
  new_type = GGML_TYPE_Q5_K;
10513
10724
  }
10514
10725
  else if (new_type != GGML_TYPE_Q8_0) {
10515
10726
  new_type = GGML_TYPE_Q6_K;
10516
10727
  }
10517
10728
  } else if (name == "token_embd.weight") {
10518
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10729
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
10730
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10519
10731
  new_type = GGML_TYPE_Q2_K;
10520
10732
  }
10733
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10734
+ new_type = GGML_TYPE_IQ3_S;
10735
+ }
10521
10736
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10522
- new_type = GGML_TYPE_Q4_K;
10737
+ new_type = GGML_TYPE_IQ3_S;
10523
10738
  }
10524
- } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10739
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
10740
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10525
10741
  if (name.find("attn_v.weight") != std::string::npos) {
10526
10742
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
10527
- else new_type = GGML_TYPE_Q2_K;
10743
+ else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
10528
10744
  ++qs.i_attention_wv;
10529
10745
  }
10746
+ else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
10747
+ new_type = GGML_TYPE_Q4_K;
10748
+ }
10530
10749
  else if (name.find("ffn_down") != std::string::npos) {
10531
- if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
10750
+ if (qs.i_ffn_down < qs.n_ffn_down/8) {
10751
+ new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
10752
+ }
10532
10753
  ++qs.i_ffn_down;
10533
10754
  }
10534
10755
  else if (name.find("attn_output.weight") != std::string::npos) {
10535
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
10756
+ if (qs.model.hparams.n_expert == 8) {
10757
+ new_type = GGML_TYPE_Q5_K;
10758
+ } else {
10759
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
10760
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
10761
+ }
10536
10762
  }
10537
10763
  } else if (name.find("attn_v.weight") != std::string::npos) {
10538
10764
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
@@ -10542,13 +10768,25 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10542
10768
  new_type = GGML_TYPE_Q4_K;
10543
10769
  }
10544
10770
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10545
- new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
10771
+ new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
10772
+ }
10773
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
10774
+ new_type = GGML_TYPE_Q4_K;
10775
+ }
10776
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
10777
+ new_type = GGML_TYPE_Q4_K;
10778
+ }
10779
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
10780
+ new_type = GGML_TYPE_Q4_K;
10781
+ }
10782
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
10783
+ new_type = GGML_TYPE_Q4_K;
10546
10784
  }
10547
10785
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
10548
10786
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
10549
10787
  }
10550
10788
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
10551
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
10789
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
10552
10790
  new_type = GGML_TYPE_Q5_K;
10553
10791
  }
10554
10792
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -10574,14 +10812,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10574
10812
  // TODO: explore better strategies
10575
10813
  new_type = GGML_TYPE_Q8_0;
10576
10814
  }
10577
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
10578
- new_type = GGML_TYPE_Q2_K;
10815
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
10816
+ new_type = GGML_TYPE_IQ3_XXS;
10817
+ }
10818
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10819
+ new_type = GGML_TYPE_IQ2_S;
10820
+ }
10821
+ } else if (name.find("attn_q.weight") != std::string::npos) {
10822
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
10823
+ new_type = GGML_TYPE_IQ3_XXS;
10824
+ }
10825
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10826
+ new_type = GGML_TYPE_IQ2_S;
10579
10827
  }
10580
10828
  } else if (name.find("ffn_down") != std::string::npos) {
10581
10829
  auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
10582
10830
  int i_layer = info.first, n_layer = info.second;
10583
10831
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
10584
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
10832
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
10585
10833
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
10586
10834
  }
10587
10835
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
@@ -10592,6 +10840,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10592
10840
  : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
10593
10841
  : GGML_TYPE_Q3_K;
10594
10842
  }
10843
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
10844
+ (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
10845
+ new_type = GGML_TYPE_Q4_K;
10846
+ }
10595
10847
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
10596
10848
  new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
10597
10849
  }
@@ -10603,8 +10855,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10603
10855
  if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
10604
10856
  }
10605
10857
  }
10606
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
10607
- if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
10858
+ else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
10859
+ new_type = GGML_TYPE_Q5_K;
10608
10860
  }
10609
10861
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
10610
10862
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
@@ -10621,39 +10873,43 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10621
10873
  } else if (name.find("attn_output.weight") != std::string::npos) {
10622
10874
  if (arch != LLM_ARCH_FALCON) {
10623
10875
  if (qs.model.hparams.n_expert == 8) {
10624
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
10876
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
10625
10877
  ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
10626
- ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
10878
+ ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
10879
+ ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
10627
10880
  new_type = GGML_TYPE_Q5_K;
10628
10881
  }
10629
10882
  } else {
10630
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
10631
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
10632
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
10633
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
10883
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
10884
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
10885
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
10886
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
10887
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
10634
10888
  }
10635
10889
  } else {
10636
10890
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
10637
10891
  }
10638
10892
  }
10639
10893
  else if (name.find("attn_qkv.weight") != std::string::npos) {
10640
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
10894
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
10895
+ new_type = GGML_TYPE_Q4_K;
10896
+ }
10641
10897
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
10642
10898
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
10643
10899
  }
10644
10900
  else if (name.find("ffn_gate") != std::string::npos) {
10645
10901
  auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
10646
10902
  int i_layer = info.first, n_layer = info.second;
10647
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
10648
- new_type = GGML_TYPE_Q2_K;
10903
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
10904
+ new_type = GGML_TYPE_IQ3_XXS;
10649
10905
  }
10650
10906
  ++qs.i_ffn_gate;
10651
10907
  }
10652
10908
  else if (name.find("ffn_up") != std::string::npos) {
10653
10909
  auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
10654
10910
  int i_layer = info.first, n_layer = info.second;
10655
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
10656
- new_type = GGML_TYPE_Q2_K;
10911
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
10912
+ new_type = GGML_TYPE_IQ3_XXS;
10657
10913
  }
10658
10914
  ++qs.i_ffn_up;
10659
10915
  }
@@ -10671,9 +10927,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10671
10927
  //}
10672
10928
  bool convert_incompatible_tensor = false;
10673
10929
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
10674
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
10675
- new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
10676
- new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10930
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
10931
+ new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
10932
+ new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
10677
10933
  int nx = tensor->ne[0];
10678
10934
  int ny = tensor->ne[1];
10679
10935
  if (nx % QK_K != 0) {
@@ -10687,13 +10943,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10687
10943
  switch (new_type) {
10688
10944
  case GGML_TYPE_IQ2_XXS:
10689
10945
  case GGML_TYPE_IQ2_XS:
10946
+ case GGML_TYPE_IQ2_S:
10690
10947
  case GGML_TYPE_IQ3_XXS:
10948
+ case GGML_TYPE_IQ3_S:
10691
10949
  case GGML_TYPE_IQ1_S:
10692
10950
  case GGML_TYPE_Q2_K:
10693
- case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
10694
- case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
10695
- case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
10696
- case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
10951
+ case GGML_TYPE_Q3_K:
10952
+ case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
10953
+ case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
10954
+ case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
10955
+ case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
10697
10956
  default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
10698
10957
  }
10699
10958
  LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
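The fallback exists because the k-quant and IQ super-block formats pack each row in blocks of QK_K values, so a tensor whose row length is not a multiple of QK_K cannot use them at all; a hedged illustration of the check:

    // hedged illustration: QK_K is 256 in the default build, so a row length such as
    // ne[0] == 4480 (4480 % 256 == 128) cannot be packed into super-blocks and is
    // converted with the fallback mapping above (e.g. Q4_K -> Q5_0, Q6_K -> Q8_0)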
@@ -10719,7 +10978,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10719
10978
  // K-quants
10720
10979
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
10721
10980
  case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
10722
- case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
10981
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break;
10723
10982
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
10724
10983
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
10725
10984
  case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -10730,9 +10989,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10730
10989
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
10731
10990
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
10732
10991
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
10992
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break;
10993
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break;
10733
10994
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
10734
10995
  case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
10735
10996
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
10997
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break;
10998
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
10999
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
10736
11000
 
10737
11001
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
10738
11002
  }
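The new IQ2_S/IQ2_M/IQ3_S/IQ3_M/IQ4_XS file types are selected through the existing quantization entry point; a minimal hedged usage sketch (the file names are placeholders):

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_IQ3_M;   // one of the file types added above
    qparams.nthread = 8;
    // returns 0 on success
    const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq3_m.gguf", &qparams);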
@@ -10862,7 +11126,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10862
11126
  quantize &= !params->only_copy;
10863
11127
 
10864
11128
  // do not quantize expert gating tensors
10865
- quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
11129
+ // NOTE: can't use LLM_TN here because the layer number is not known
11130
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
10866
11131
 
10867
11132
  // do not quantize positional embeddings and token types (BERT)
10868
11133
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
@@ -10906,6 +11171,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10906
11171
  }
10907
11172
  if ((new_type == GGML_TYPE_IQ2_XXS ||
10908
11173
  new_type == GGML_TYPE_IQ2_XS ||
11174
+ new_type == GGML_TYPE_IQ2_S ||
10909
11175
  new_type == GGML_TYPE_IQ1_S ||
10910
11176
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
10911
11177
  LLAMA_LOG_ERROR("\n\n============================================================\n");
@@ -11327,7 +11593,7 @@ static int llama_apply_lora_from_file_internal(
11327
11593
  struct llama_model_params llama_model_default_params() {
11328
11594
  struct llama_model_params result = {
11329
11595
  /*.n_gpu_layers =*/ 0,
11330
- /*.split_mode =*/ LLAMA_SPLIT_LAYER,
11596
+ /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
11331
11597
  /*.main_gpu =*/ 0,
11332
11598
  /*.tensor_split =*/ nullptr,
11333
11599
  /*.progress_callback =*/ nullptr,
@@ -11353,7 +11619,7 @@ struct llama_context_params llama_context_default_params() {
11353
11619
  /*.n_batch =*/ 512,
11354
11620
  /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
11355
11621
  /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
11356
- /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
11622
+ /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
11357
11623
  /*.rope_freq_base =*/ 0.0f,
11358
11624
  /*.rope_freq_scale =*/ 0.0f,
11359
11625
  /*.yarn_ext_factor =*/ -1.0f,
@@ -11361,11 +11627,11 @@ struct llama_context_params llama_context_default_params() {
11361
11627
  /*.yarn_beta_fast =*/ 32.0f,
11362
11628
  /*.yarn_beta_slow =*/ 1.0f,
11363
11629
  /*.yarn_orig_ctx =*/ 0,
11630
+ /*.defrag_thold =*/ -1.0f,
11364
11631
  /*.cb_eval =*/ nullptr,
11365
11632
  /*.cb_eval_user_data =*/ nullptr,
11366
11633
  /*.type_k =*/ GGML_TYPE_F16,
11367
11634
  /*.type_v =*/ GGML_TYPE_F16,
11368
- /*.mul_mat_q =*/ true,
11369
11635
  /*.logits_all =*/ false,
11370
11636
  /*.embedding =*/ false,
11371
11637
  /*.offload_kqv =*/ true,
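The new defrag_thold field defaults to -1.0f, so automatic KV-cache defragmentation stays disabled unless the caller opts in; a hedged sketch of enabling it when creating a context (the model is assumed to be loaded elsewhere):

    llama_context_params cparams = llama_context_default_params();
    cparams.defrag_thold = 0.1f;   // defragment once more than ~10% of the KV window is holes
    llama_context * ctx = llama_new_context_with_model(model, cparams);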
@@ -11421,15 +11687,6 @@ bool llama_supports_gpu_offload(void) {
11421
11687
  #endif
11422
11688
  }
11423
11689
 
11424
- // deprecated:
11425
- bool llama_mmap_supported(void) {
11426
- return llama_supports_mmap();
11427
- }
11428
-
11429
- bool llama_mlock_supported(void) {
11430
- return llama_supports_mlock();
11431
- }
11432
-
11433
11690
  void llama_backend_init(void) {
11434
11691
  ggml_time_init();
11435
11692
 
@@ -11525,7 +11782,7 @@ struct llama_context * llama_new_context_with_model(
11525
11782
  cparams.yarn_attn_factor = params.yarn_attn_factor;
11526
11783
  cparams.yarn_beta_fast = params.yarn_beta_fast;
11527
11784
  cparams.yarn_beta_slow = params.yarn_beta_slow;
11528
- cparams.mul_mat_q = params.mul_mat_q;
11785
+ cparams.defrag_thold = params.defrag_thold;
11529
11786
  cparams.offload_kqv = params.offload_kqv;
11530
11787
  cparams.do_pooling = params.do_pooling;
11531
11788
 
@@ -11541,16 +11798,16 @@ struct llama_context * llama_new_context_with_model(
11541
11798
  cparams.cb_eval_user_data = params.cb_eval_user_data;
11542
11799
 
11543
11800
  auto rope_scaling_type = params.rope_scaling_type;
11544
- if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
11801
+ if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
11545
11802
  rope_scaling_type = hparams.rope_scaling_type_train;
11546
11803
  }
11547
11804
 
11548
- if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
11805
+ if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
11549
11806
  cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
11550
11807
  }
11551
11808
 
11552
11809
  if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
11553
- cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
11810
+ cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
11554
11811
  }
11555
11812
 
11556
11813
  if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -11584,8 +11841,8 @@ struct llama_context * llama_new_context_with_model(
11584
11841
  }
11585
11842
  #elif defined(GGML_USE_CUBLAS)
11586
11843
  if (model->n_gpu_layers > 0) {
11587
- // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
11588
- if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
11844
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
11845
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
11589
11846
  ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
11590
11847
  if (backend == nullptr) {
11591
11848
  LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -11594,7 +11851,7 @@ struct llama_context * llama_new_context_with_model(
11594
11851
  }
11595
11852
  ctx->backends.push_back(backend);
11596
11853
  } else {
11597
- // LLAMA_SPLIT_LAYER requires a backend for each GPU
11854
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
11598
11855
  for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
11599
11856
  ggml_backend_t backend = ggml_backend_cuda_init(device);
11600
11857
  if (backend == nullptr) {
@@ -11647,8 +11904,7 @@ struct llama_context * llama_new_context_with_model(
11647
11904
  }
11648
11905
  ctx->backends.push_back(ctx->backend_cpu);
11649
11906
 
11650
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
11651
- cparams.n_ctx, cparams.offload_kqv)) {
11907
+ if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
11652
11908
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
11653
11909
  llama_free(ctx);
11654
11910
  return nullptr;
@@ -11727,7 +11983,7 @@ struct llama_context * llama_new_context_with_model(
11727
11983
  }
11728
11984
 
11729
11985
  // buffer used to store the computation graph and the tensor meta data
11730
- ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
11986
+ ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
11731
11987
 
11732
11988
  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
11733
11989
 
@@ -11796,6 +12052,49 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
11796
12052
  return model->vocab.type;
11797
12053
  }
11798
12054
 
12055
+ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
12056
+ switch (model->arch) {
12057
+ // these models do not use RoPE
12058
+ case LLM_ARCH_GPT2:
12059
+ case LLM_ARCH_GPTJ:
12060
+ case LLM_ARCH_GPTNEOX:
12061
+ case LLM_ARCH_MPT:
12062
+ case LLM_ARCH_REFACT:
12063
+ case LLM_ARCH_BLOOM:
12064
+ return LLAMA_ROPE_TYPE_NONE;
12065
+
12066
+ // use what we call a normal RoPE, operating on pairs of consecutive head values
12067
+ case LLM_ARCH_LLAMA:
12068
+ case LLM_ARCH_BAICHUAN:
12069
+ case LLM_ARCH_STARCODER:
12070
+ case LLM_ARCH_PLAMO:
12071
+ case LLM_ARCH_CODESHELL:
12072
+ case LLM_ARCH_ORION:
12073
+ case LLM_ARCH_INTERNLM2:
12074
+ case LLM_ARCH_MINICPM:
12075
+ return LLAMA_ROPE_TYPE_NORM;
12076
+
12077
+ // the pairs of head values are offset by n_rot/2
12078
+ case LLM_ARCH_FALCON:
12079
+ case LLM_ARCH_PERSIMMON:
12080
+ case LLM_ARCH_BERT:
12081
+ case LLM_ARCH_NOMIC_BERT:
12082
+ case LLM_ARCH_STABLELM:
12083
+ case LLM_ARCH_QWEN:
12084
+ case LLM_ARCH_QWEN2:
12085
+ case LLM_ARCH_PHI2:
12086
+ case LLM_ARCH_GEMMA:
12087
+ return LLAMA_ROPE_TYPE_NEOX;
12088
+
12089
+ // all model arches should be listed explicitly here
12090
+ case LLM_ARCH_UNKNOWN:
12091
+ GGML_ASSERT(false && "unknown architecture");
12092
+ break;
12093
+ }
12094
+
12095
+ return LLAMA_ROPE_TYPE_NONE;
12096
+ }
12097
+
11799
12098
  int32_t llama_n_vocab(const struct llama_model * model) {
11800
12099
  return model->vocab.id_to_token.size();
11801
12100
  }
@@ -11898,15 +12197,6 @@ uint32_t llama_model_quantize(
11898
12197
  }
11899
12198
  }
11900
12199
 
11901
- int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
11902
- try {
11903
- return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
11904
- } catch (const std::exception & err) {
11905
- LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
11906
- return 1;
11907
- }
11908
- }
11909
-
11910
12200
  int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
11911
12201
  try {
11912
12202
  return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
@@ -12038,12 +12328,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
12038
12328
  llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
12039
12329
  }
12040
12330
 
12041
- void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
12331
+ void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
12042
12332
  if (delta == 0) {
12043
12333
  return;
12044
12334
  }
12045
12335
 
12046
- llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
12336
+ llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
12047
12337
  }
12048
12338
 
12049
12339
  void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
@@ -12054,6 +12344,19 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
12054
12344
  llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
12055
12345
  }
12056
12346
 
12347
+ llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
12348
+ return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
12349
+ }
12350
+
12351
+ void llama_kv_cache_defrag(struct llama_context * ctx) {
12352
+ llama_kv_cache_defrag(ctx->kv_self);
12353
+ }
12354
+
12355
+ void llama_kv_cache_update(struct llama_context * ctx) {
12356
+ llama_kv_cache_update_internal(*ctx);
12357
+ }
12358
+
12359
+
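Taken together, the renamed llama_kv_cache_seq_add and the newly exposed defrag/update calls give the typical context-shift flow: drop the oldest tokens of a sequence, slide the remaining positions back, and let llama_kv_cache_update apply the pending K-shift (and any queued defrag) lazily. A hedged sketch, with n_discard chosen by the caller:

    // drop the first n_discard positions of sequence 0 and slide the rest back
    llama_kv_cache_seq_rm (ctx, 0, 0, n_discard);
    llama_kv_cache_seq_add(ctx, 0, n_discard, -1, -n_discard);   // was llama_kv_cache_seq_shift
    // optional: request a defrag pass instead of waiting for defrag_thold to trigger one
    llama_kv_cache_defrag(ctx);
    // applies the pending shift/defrag now; llama_decode also calls this internally
    llama_kv_cache_update(ctx);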
12057
12360
  // Returns the *maximum* size of the state
12058
12361
  size_t llama_get_state_size(const struct llama_context * ctx) {
12059
12362
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
@@ -12180,10 +12483,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12180
12483
  const auto & hparams = ctx->model.hparams;
12181
12484
  const auto & cparams = ctx->cparams;
12182
12485
 
12183
- const auto n_layer = hparams.n_layer;
12184
- const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
12185
- const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
12186
- const auto n_ctx = cparams.n_ctx;
12486
+ const uint32_t n_layer = hparams.n_layer;
12487
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12488
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12489
+ const uint32_t n_ctx = cparams.n_ctx;
12187
12490
 
12188
12491
  const size_t kv_buf_size = kv_self.total_size();
12189
12492
  const uint32_t kv_head = kv_self.head;
@@ -12198,14 +12501,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12198
12501
  if (kv_buf_size) {
12199
12502
  std::vector<uint8_t> tmp_buf;
12200
12503
  for (int il = 0; il < (int) n_layer; ++il) {
12201
- size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12504
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12505
+
12202
12506
  tmp_buf.resize(k_size);
12203
12507
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
12204
12508
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
12205
12509
 
12206
12510
  // v is not contiguous, copy row by row
12207
- size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12208
- size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12511
+ const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12512
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12513
+
12209
12514
  tmp_buf.resize(v_row_size);
12210
12515
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12211
12516
  ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
@@ -12238,8 +12543,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
12238
12543
  }
12239
12544
 
12240
12545
  // Sets the state reading from the specified source address
12241
- size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12242
- uint8_t * inp = src;
12546
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12547
+ const uint8_t * inp = src;
12243
12548
 
12244
12549
  // set rng
12245
12550
  {
@@ -12248,7 +12553,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12248
12553
 
12249
12554
  GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
12250
12555
 
12251
- std::string rng_str((char *)inp, rng_size); inp += rng_size;
12556
+ std::string rng_str((const char *)inp, rng_size); inp += rng_size;
12252
12557
 
12253
12558
  std::istringstream rng_ss(rng_str);
12254
12559
  rng_ss >> ctx->rng;
@@ -12292,10 +12597,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12292
12597
  const auto & hparams = ctx->model.hparams;
12293
12598
  const auto & cparams = ctx->cparams;
12294
12599
 
12295
- const int n_layer = hparams.n_layer;
12296
- const int n_embd_k_gqa = hparams.n_embd_k_gqa();
12297
- const int n_embd_v_gqa = hparams.n_embd_v_gqa();
12298
- const int n_ctx = cparams.n_ctx;
12600
+ const uint32_t n_layer = hparams.n_layer;
12601
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12602
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12603
+ const uint32_t n_ctx = cparams.n_ctx;
12299
12604
 
12300
12605
  size_t kv_buf_size;
12301
12606
  uint32_t kv_head;
@@ -12311,13 +12616,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12311
12616
  GGML_ASSERT(kv_self.total_size() == kv_buf_size);
12312
12617
 
12313
12618
  for (int il = 0; il < (int) n_layer; ++il) {
12314
- size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12619
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12620
+
12315
12621
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
12316
12622
  inp += k_size;
12317
12623
 
12318
12624
  // v is not contiguous, copy row by row
12319
- size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12320
- size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12625
+ const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12626
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12627
+
12321
12628
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12322
12629
  ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
12323
12630
  inp += v_row_size;
@@ -12439,38 +12746,6 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
12439
12746
  return true;
12440
12747
  }
12441
12748
 
12442
- int llama_eval(
12443
- struct llama_context * ctx,
12444
- llama_token * tokens,
12445
- int32_t n_tokens,
12446
- int32_t n_past) {
12447
- llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
12448
-
12449
- const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
12450
- if (ret < 0) {
12451
- LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
12452
- }
12453
-
12454
- return ret;
12455
- }
12456
-
12457
- int llama_eval_embd(
12458
- struct llama_context * ctx,
12459
- float * embd,
12460
- int32_t n_tokens,
12461
- int32_t n_past) {
12462
- llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
12463
-
12464
- llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
12465
-
12466
- const int ret = llama_decode_internal(*ctx, batch);
12467
- if (ret < 0) {
12468
- LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
12469
- }
12470
-
12471
- return ret;
12472
- }
12473
-
12474
12749
  void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
12475
12750
  ctx->cparams.n_threads = n_threads;
12476
12751
  ctx->cparams.n_threads_batch = n_threads_batch;
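With llama_eval and llama_eval_embd removed, token evaluation goes through the batch API; a minimal hedged migration sketch for the token path (tokens, n_tokens and n_past are the caller's existing variables):

    // before: llama_eval(ctx, tokens, n_tokens, n_past);
    // if the old behaviour of discarding cache entries past n_past is needed, do it explicitly:
    llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
    // then build a single-sequence batch and decode it
    llama_batch batch = llama_batch_get_one(tokens, n_tokens, n_past, 0);
    if (llama_decode(ctx, batch) < 0) {
        fprintf(stderr, "llama_decode failed\n");
    }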