llama_cpp 0.12.7 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -68,10 +68,12 @@
68
68
  #include <cstdio>
69
69
  #include <cstring>
70
70
  #include <ctime>
71
+ #include <cwctype>
71
72
  #include <forward_list>
72
73
  #include <fstream>
73
74
  #include <functional>
74
75
  #include <initializer_list>
76
+ #include <locale>
75
77
  #include <map>
76
78
  #include <memory>
77
79
  #include <mutex>
@@ -850,9 +852,9 @@ struct LLM_TN {
850
852
  //
851
853
 
852
854
  static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
853
- { LLAMA_ROPE_SCALING_NONE, "none" },
854
- { LLAMA_ROPE_SCALING_LINEAR, "linear" },
855
- { LLAMA_ROPE_SCALING_YARN, "yarn" },
855
+ { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
856
+ { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
857
+ { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
856
858
  };
857
859
 
858
860
  static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
@@ -862,7 +864,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
862
864
  }
863
865
  }
864
866
 
865
- return LLAMA_ROPE_SCALING_UNSPECIFIED;
867
+ return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
866
868
  }
867
869
 
868
870
  static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
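
Note on the two hunks above: the rope-scaling constants move from LLAMA_ROPE_SCALING_NONE / _LINEAR / _YARN / _UNSPECIFIED to the LLAMA_ROPE_SCALING_TYPE_* spellings. A minimal sketch of what a caller of the bundled C API looks like after the rename; it assumes the renamed values and the rope_scaling_type field on llama_context_params are exposed by this revision's llama.h (neither header change is shown in this diff):

    #include "llama.h"

    // Sketch only: pick YARN rope scaling with the renamed constant.
    static struct llama_context_params ctx_params_yarn(void) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; // was LLAMA_ROPE_SCALING_YARN
        return cparams;
    }
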
@@ -1550,8 +1552,9 @@ static const size_t MiB = 1024*kiB;
1550
1552
  static const size_t GiB = 1024*MiB;
1551
1553
 
1552
1554
  struct llama_hparams {
1553
- bool vocab_only;
1554
- bool rope_finetuned;
1555
+ bool vocab_only;
1556
+ bool rope_finetuned;
1557
+
1555
1558
  uint32_t n_vocab;
1556
1559
  uint32_t n_ctx_train; // context size the model was trained on
1557
1560
  uint32_t n_embd;
@@ -1580,7 +1583,8 @@ struct llama_hparams {
1580
1583
  bool causal_attn = true;
1581
1584
  bool need_kq_pos = false;
1582
1585
 
1583
- uint32_t pooling_type = LLAMA_POOLING_NONE;
1586
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1587
+ enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
1584
1588
 
1585
1589
  bool operator!=(const llama_hparams & other) const {
1586
1590
  if (this->vocab_only != other.vocab_only) return true;
@@ -1639,8 +1643,8 @@ struct llama_cparams {
1639
1643
  float yarn_attn_factor;
1640
1644
  float yarn_beta_fast;
1641
1645
  float yarn_beta_slow;
1646
+ float defrag_thold;
1642
1647
 
1643
- bool mul_mat_q;
1644
1648
  bool offload_kqv;
1645
1649
  bool do_pooling;
1646
1650
 
@@ -1707,11 +1711,20 @@ struct llama_kv_cell {
1707
1711
  bool has_seq_id(const llama_seq_id & id) const {
1708
1712
  return seq_id.find(id) != seq_id.end();
1709
1713
  }
1714
+
1715
+ bool is_empty() const {
1716
+ return seq_id.empty();
1717
+ }
1718
+
1719
+ bool is_same_seq(const llama_kv_cell & other) const {
1720
+ return seq_id == other.seq_id;
1721
+ }
1710
1722
  };
1711
1723
 
1712
1724
  // ring-buffer of cached KV data
1713
1725
  struct llama_kv_cache {
1714
1726
  bool has_shift = false;
1727
+ bool do_defrag = false;
1715
1728
 
1716
1729
  // Note: The value of head isn't only used to optimize searching
1717
1730
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -1723,6 +1736,9 @@ struct llama_kv_cache {
1723
1736
  // computed before each graph build
1724
1737
  uint32_t n = 0;
1725
1738
 
1739
+ ggml_type type_k = GGML_TYPE_F16;
1740
+ ggml_type type_v = GGML_TYPE_F16;
1741
+
1726
1742
  std::vector<llama_kv_cell> cells;
1727
1743
 
1728
1744
  std::vector<struct ggml_tensor *> k_l; // per layer
@@ -1958,8 +1974,8 @@ struct llama_context {
1958
1974
  static bool llama_kv_cache_init(
1959
1975
  struct llama_kv_cache & cache,
1960
1976
  const llama_model & model,
1961
- ggml_type ktype,
1962
- ggml_type vtype,
1977
+ ggml_type type_k,
1978
+ ggml_type type_v,
1963
1979
  uint32_t n_ctx,
1964
1980
  bool offload) {
1965
1981
  const struct llama_hparams & hparams = model.hparams;
@@ -1974,6 +1990,9 @@ static bool llama_kv_cache_init(
1974
1990
  cache.size = n_ctx;
1975
1991
  cache.used = 0;
1976
1992
 
1993
+ cache.type_k = type_k;
1994
+ cache.type_v = type_v;
1995
+
1977
1996
  cache.cells.clear();
1978
1997
  cache.cells.resize(n_ctx);
1979
1998
 
@@ -2014,8 +2033,8 @@ static bool llama_kv_cache_init(
2014
2033
 
2015
2034
  for (int i = 0; i < (int) n_layer; i++) {
2016
2035
  struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
2017
- ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
2018
- ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
2036
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
2037
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
2019
2038
  ggml_format_name(k, "cache_k_l%d", i);
2020
2039
  ggml_format_name(v, "cache_v_l%d", i);
2021
2040
  cache.k_l.push_back(k);
@@ -2099,7 +2118,7 @@ static bool llama_kv_cache_find_slot(
2099
2118
  // find how many cells are currently in use
2100
2119
  static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
2101
2120
  for (uint32_t i = cache.size - 1; i > 0; --i) {
2102
- if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
2121
+ if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
2103
2122
  return i + 1;
2104
2123
  }
2105
2124
  }
@@ -2135,7 +2154,7 @@ static void llama_kv_cache_seq_rm(
2135
2154
  } else {
2136
2155
  continue;
2137
2156
  }
2138
- if (cache.cells[i].seq_id.empty()) {
2157
+ if (cache.cells[i].is_empty()) {
2139
2158
  // keep count of the number of used cells
2140
2159
  if (cache.cells[i].pos >= 0) cache.used--;
2141
2160
 
@@ -2186,7 +2205,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
2186
2205
  if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
2187
2206
  }
2188
2207
 
2189
- static void llama_kv_cache_seq_shift(
2208
+ static void llama_kv_cache_seq_add(
2190
2209
  struct llama_kv_cache & cache,
2191
2210
  llama_seq_id seq_id,
2192
2211
  llama_pos p0,
@@ -2204,10 +2223,14 @@ static void llama_kv_cache_seq_shift(
2204
2223
  cache.cells[i].delta += delta;
2205
2224
 
2206
2225
  if (cache.cells[i].pos < 0) {
2207
- if (!cache.cells[i].seq_id.empty()) cache.used--;
2226
+ if (!cache.cells[i].is_empty()) {
2227
+ cache.used--;
2228
+ }
2208
2229
  cache.cells[i].pos = -1;
2209
2230
  cache.cells[i].seq_id.clear();
2210
- if (new_head == cache.size) new_head = i;
2231
+ if (new_head == cache.size) {
2232
+ new_head = i;
2233
+ }
2211
2234
  }
2212
2235
  }
2213
2236
  }
@@ -2239,6 +2262,22 @@ static void llama_kv_cache_seq_div(
2239
2262
  }
2240
2263
  }
2241
2264
 
2265
+ static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
2266
+ llama_pos result = 0;
2267
+
2268
+ for (uint32_t i = 0; i < cache.size; ++i) {
2269
+ if (cache.cells[i].has_seq_id(seq_id)) {
2270
+ result = std::max(result, cache.cells[i].pos);
2271
+ }
2272
+ }
2273
+
2274
+ return result;
2275
+ }
2276
+
2277
+ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
2278
+ cache.do_defrag = true;
2279
+ }
2280
+
2242
2281
  //
2243
2282
  // model loading and saving
2244
2283
  //
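
The hunks above rename the internal sequence-shift helper to llama_kv_cache_seq_add and add two new helpers: llama_kv_cache_seq_pos_max (largest position stored for a sequence) and llama_kv_cache_defrag, which only sets do_defrag and defers the real work to a later cache update. A hedged usage sketch against the public C API, assuming the bundled llama.h exposes matching wrappers (llama_kv_cache_seq_rm/_seq_add/_defrag/_update); only the internal static helpers are visible in this diff:

    #include "llama.h"

    // Sketch: forget the first n_drop positions of a sequence, shift the rest
    // back, then ask the cache to compact itself.
    static void drop_prefix(struct llama_context * ctx, llama_seq_id seq, llama_pos n_drop) {
        llama_kv_cache_seq_rm (ctx, seq, 0, n_drop);            // remove [0, n_drop)
        llama_kv_cache_seq_add(ctx, seq, n_drop, -1, -n_drop);  // formerly llama_kv_cache_seq_shift
        llama_kv_cache_defrag (ctx);                            // queue defragmentation
        llama_kv_cache_update (ctx);                            // apply pending shift/defrag
    }
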
@@ -2310,7 +2349,7 @@ namespace GGUFMeta {
2310
2349
  }
2311
2350
  };
2312
2351
 
2313
- struct ArrayInfo{
2352
+ struct ArrayInfo {
2314
2353
  const gguf_type gt;
2315
2354
  const size_t length;
2316
2355
  const void * data;
@@ -2329,7 +2368,7 @@ namespace GGUFMeta {
2329
2368
  };
2330
2369
 
2331
2370
  template<typename T>
2332
- class GKV: public GKV_Base<T> {
2371
+ class GKV : public GKV_Base<T> {
2333
2372
  GKV() = delete;
2334
2373
 
2335
2374
  public:
@@ -2345,46 +2384,46 @@ namespace GGUFMeta {
2345
2384
 
2346
2385
  static const char * override_type_to_str(const llama_model_kv_override_type ty) {
2347
2386
  switch (ty) {
2348
- case LLAMA_KV_OVERRIDE_BOOL: return "bool";
2349
- case LLAMA_KV_OVERRIDE_INT: return "int";
2350
- case LLAMA_KV_OVERRIDE_FLOAT: return "float";
2387
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
2388
+ case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
2389
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
2351
2390
  }
2352
2391
  return "unknown";
2353
2392
  }
2354
2393
 
2355
- static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
2356
- if (!override) { return false; }
2357
- if (override->tag == expected_type) {
2394
+ static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
2395
+ if (!ovrd) { return false; }
2396
+ if (ovrd->tag == expected_type) {
2358
2397
  LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
2359
- __func__, override_type_to_str(override->tag), override->key);
2360
- switch (override->tag) {
2361
- case LLAMA_KV_OVERRIDE_BOOL: {
2362
- LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
2398
+ __func__, override_type_to_str(ovrd->tag), ovrd->key);
2399
+ switch (ovrd->tag) {
2400
+ case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
2401
+ LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
2363
2402
  } break;
2364
- case LLAMA_KV_OVERRIDE_INT: {
2365
- LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
2403
+ case LLAMA_KV_OVERRIDE_TYPE_INT: {
2404
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
2366
2405
  } break;
2367
- case LLAMA_KV_OVERRIDE_FLOAT: {
2368
- LLAMA_LOG_INFO("%.6f\n", override->float_value);
2406
+ case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
2407
+ LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
2369
2408
  } break;
2370
2409
  default:
2371
2410
  // Shouldn't be possible to end up here, but just in case...
2372
2411
  throw std::runtime_error(
2373
2412
  format("Unsupported attempt to override %s type for metadata key %s\n",
2374
- override_type_to_str(override->tag), override->key));
2413
+ override_type_to_str(ovrd->tag), ovrd->key));
2375
2414
  }
2376
2415
  return true;
2377
2416
  }
2378
2417
  LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
2379
- __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
2418
+ __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
2380
2419
  return false;
2381
2420
  }
2382
2421
 
2383
2422
  template<typename OT>
2384
2423
  static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
2385
- try_override(OT & target, const struct llama_model_kv_override *override) {
2386
- if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
2387
- target = override->bool_value;
2424
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2425
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
2426
+ target = ovrd->bool_value;
2388
2427
  return true;
2389
2428
  }
2390
2429
  return false;
@@ -2392,9 +2431,9 @@ namespace GGUFMeta {
2392
2431
 
2393
2432
  template<typename OT>
2394
2433
  static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
2395
- try_override(OT & target, const struct llama_model_kv_override *override) {
2396
- if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
2397
- target = override->int_value;
2434
+ try_override(OT & target, const struct llama_model_kv_override * ovrd) {
2435
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
2436
+ target = ovrd->int_value;
2398
2437
  return true;
2399
2438
  }
2400
2439
  return false;
@@ -2402,9 +2441,9 @@ namespace GGUFMeta {
2402
2441
 
2403
2442
  template<typename OT>
2404
2443
  static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
2405
- try_override(T & target, const struct llama_model_kv_override *override) {
2406
- if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
2407
- target = override->float_value;
2444
+ try_override(T & target, const struct llama_model_kv_override * ovrd) {
2445
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
2446
+ target = ovrd->float_value;
2408
2447
  return true;
2409
2448
  }
2410
2449
  return false;
@@ -2412,17 +2451,17 @@ namespace GGUFMeta {
2412
2451
 
2413
2452
  template<typename OT>
2414
2453
  static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
2415
- try_override(T & target, const struct llama_model_kv_override *override) {
2454
+ try_override(T & target, const struct llama_model_kv_override * ovrd) {
2416
2455
  (void)target;
2417
- (void)override;
2418
- if (!override) { return false; }
2456
+ (void)ovrd;
2457
+ if (!ovrd) { return false; }
2419
2458
  // Currently, we should never end up here so it would be a bug if we do.
2420
2459
  throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
2421
- override ? override->key : "NULL"));
2460
+ ovrd ? ovrd->key : "NULL"));
2422
2461
  }
2423
2462
 
2424
- static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
2425
- if (try_override<T>(target, override)) {
2463
+ static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
2464
+ if (try_override<T>(target, ovrd)) {
2426
2465
  return true;
2427
2466
  }
2428
2467
  if (k < 0) { return false; }
@@ -2430,12 +2469,12 @@ namespace GGUFMeta {
2430
2469
  return true;
2431
2470
  }
2432
2471
 
2433
- static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
2434
- return set(ctx, gguf_find_key(ctx, key), target, override);
2472
+ static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
2473
+ return set(ctx, gguf_find_key(ctx, key), target, ovrd);
2435
2474
  }
2436
2475
 
2437
- static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
2438
- return set(ctx, key.c_str(), target, override);
2476
+ static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
2477
+ return set(ctx, key.c_str(), target, ovrd);
2439
2478
  }
2440
2479
  };
2441
2480
  }
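
The GGUFMeta hunks rename the override tags to LLAMA_KV_OVERRIDE_TYPE_* and rename the `override` parameter to `ovrd`. A sketch of filling an override entry with the new tag names, under stated assumptions: the key/tag/bool_value field names match what the code above dereferences, and key is assumed to be a fixed-size char array as in earlier llama.h revisions; the metadata key is only an example:

    #include <cstring>
    #include "llama.h"

    // Sketch: prepare one metadata override using the renamed tag constants.
    static void fill_override(struct llama_model_kv_override & kvo) {
        std::memset(&kvo, 0, sizeof(kvo));
        std::strncpy(kvo.key, "tokenizer.ggml.add_bos_token", sizeof(kvo.key) - 1); // example key
        kvo.tag        = LLAMA_KV_OVERRIDE_TYPE_BOOL; // was LLAMA_KV_OVERRIDE_BOOL
        kvo.bool_value = false;
    }
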
@@ -2542,9 +2581,12 @@ struct llama_model_loader {
2542
2581
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
2543
2582
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2544
2583
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2584
+ case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
2545
2585
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
2546
2586
  case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
2547
2587
  case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
2588
+ case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
2589
+ case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
2548
2590
  default:
2549
2591
  {
2550
2592
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2845,6 +2887,15 @@ struct llama_model_loader {
2845
2887
  }
2846
2888
  };
2847
2889
 
2890
+ template<>
2891
+ bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
2892
+ uint32_t tmp;
2893
+ const bool found = get_key(kid, tmp, required);
2894
+ result = (enum llama_pooling_type) tmp;
2895
+ return found;
2896
+ }
2897
+
2898
+
2848
2899
  //
2849
2900
  // load LLaMA models
2850
2901
  //
@@ -2886,10 +2937,15 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2886
2937
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
2887
2938
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
2888
2939
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2889
- case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
2940
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
2941
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
2942
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
2890
2943
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
2891
2944
  case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
2892
2945
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
2946
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
2947
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
2948
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
2893
2949
 
2894
2950
  default: return "unknown, may not work";
2895
2951
  }
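
The loader and the ftype name table above pick up the new IQ quantization types (IQ2_S/IQ2_M, IQ3_XS/IQ3_S/IQ3_M, IQ4_XS). A hedged sketch of producing one of them through the public quantization entry point, assuming llama_model_quantize and the LLAMA_FTYPE_MOSTLY_IQ4_XS constant are present in this revision's llama.h:

    #include "llama.h"

    // Sketch: quantize a GGUF model to IQ4_XS (4.25 bpw per the table above).
    static bool quantize_to_iq4_xs(const char * fname_in, const char * fname_out) {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;
        return llama_model_quantize(fname_in, fname_out, &qparams) == 0;
    }
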
@@ -2923,16 +2979,16 @@ static const char * llama_model_type_name(e_model type) {
2923
2979
  default: return "?B";
2924
2980
  }
2925
2981
  }
2982
+
2926
2983
  static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
2927
2984
  switch (type) {
2928
- case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2929
- case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2930
- case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2931
- default: return "unknown";
2985
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2986
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2987
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2988
+ default: return "unknown";
2932
2989
  }
2933
2990
  }
2934
2991
 
2935
-
2936
2992
  static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
2937
2993
  model.arch = ml.get_arch();
2938
2994
  if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -2996,7 +3052,7 @@ static void llm_load_hparams(
2996
3052
  std::string rope_scaling("linear");
2997
3053
  ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
2998
3054
  hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
2999
- GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
3055
+ GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
3000
3056
 
3001
3057
  // rope_freq_scale (inverse of the kv) is optional
3002
3058
  float ropescale = 0.0f;
@@ -3109,10 +3165,10 @@ static void llm_load_hparams(
3109
3165
  } break;
3110
3166
  case LLM_ARCH_BERT:
3111
3167
  {
3112
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3113
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3168
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3169
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3114
3170
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3115
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3171
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3116
3172
 
3117
3173
  switch (hparams.n_layer) {
3118
3174
  case 3:
@@ -3130,10 +3186,10 @@ static void llm_load_hparams(
3130
3186
  } break;
3131
3187
  case LLM_ARCH_NOMIC_BERT:
3132
3188
  {
3133
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3134
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3189
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3190
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3135
3191
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3136
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3192
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3137
3193
 
3138
3194
  if (hparams.n_layer == 12 && hparams.n_embd == 768) {
3139
3195
  model.type = e_model::MODEL_137M;
@@ -3272,6 +3328,8 @@ static void llm_load_hparams(
3272
3328
  if (hparams.f_max_alibi_bias > 0.0f) {
3273
3329
  hparams.need_kq_pos = true;
3274
3330
  }
3331
+
3332
+ hparams.rope_type = llama_rope_type(&model);
3275
3333
  }
3276
3334
 
3277
3335
  // TODO: This should probably be in llama.h
@@ -3574,6 +3632,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3574
3632
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
3575
3633
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
3576
3634
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
3635
+ LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
3636
+ LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
3577
3637
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
3578
3638
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
3579
3639
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -3640,7 +3700,7 @@ static bool llm_load_tensors(
3640
3700
  model.buft_layer[i] = llama_default_buffer_type_cpu(true);
3641
3701
  }
3642
3702
 
3643
- if (split_mode == LLAMA_SPLIT_LAYER) {
3703
+ if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
3644
3704
  // calculate the split points
3645
3705
  int device_count = llama_get_device_count();
3646
3706
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
@@ -3679,10 +3739,10 @@ static bool llm_load_tensors(
3679
3739
  }
3680
3740
  } else {
3681
3741
  ggml_backend_buffer_type_t split_buft;
3682
- if (split_mode == LLAMA_SPLIT_ROW) {
3742
+ if (split_mode == LLAMA_SPLIT_MODE_ROW) {
3683
3743
  split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
3684
3744
  } else {
3685
- // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
3745
+ // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
3686
3746
  split_buft = llama_default_buffer_type_offload(main_gpu);
3687
3747
  }
3688
3748
  // assign the repeating layers
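
The split-mode constants follow the same renaming pattern (LLAMA_SPLIT_MODE_NONE / _LAYER / _ROW). A sketch of selecting layer splitting from the C API, assuming llama.h carries the renamed constants and the split_mode / main_gpu fields known from earlier llama_model_params revisions:

    #include "llama.h"

    // Sketch: split model layers across devices with the renamed constant.
    static struct llama_model_params params_layer_split(void) {
        struct llama_model_params mparams = llama_model_default_params();
        mparams.split_mode = LLAMA_SPLIT_MODE_LAYER; // was LLAMA_SPLIT_LAYER
        mparams.main_gpu   = 0;
        return mparams;
    }
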
@@ -4595,12 +4655,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
4595
4655
 
4596
4656
  using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
4597
4657
 
4598
- enum llm_rope_type {
4599
- LLM_ROPE,
4600
- LLM_ROPE_NEOX,
4601
- LLM_ROPE_GLM,
4602
- };
4603
-
4604
4658
  enum llm_ffn_op_type {
4605
4659
  LLM_FFN_SILU,
4606
4660
  LLM_FFN_GELU,
@@ -4646,55 +4700,6 @@ static struct ggml_tensor * llm_build_inp_embd(
4646
4700
  return inpL;
4647
4701
  }
4648
4702
 
4649
- // Persimmon: n_rot = n_embd_head_k/2
4650
- // Other: n_rot = n_embd_head_k
4651
- static void llm_build_k_shift(
4652
- struct ggml_context * ctx,
4653
- const llama_hparams & hparams,
4654
- const llama_cparams & cparams,
4655
- const llama_kv_cache & kv,
4656
- struct ggml_cgraph * graph,
4657
- struct ggml_tensor * K_shift,
4658
- llm_rope_type type,
4659
- int64_t n_ctx,
4660
- float freq_base,
4661
- float freq_scale,
4662
- const llm_build_cb & cb) {
4663
- const int64_t n_layer = hparams.n_layer;
4664
- const int64_t n_head_kv = hparams.n_head_kv;
4665
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
4666
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4667
- const int32_t n_rot = hparams.n_rot;
4668
- const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
4669
- const float ext_factor = cparams.yarn_ext_factor;
4670
- const float attn_factor = cparams.yarn_attn_factor;
4671
- const float beta_fast = cparams.yarn_beta_fast;
4672
- const float beta_slow = cparams.yarn_beta_slow;
4673
-
4674
- int rope_type = 0;
4675
-
4676
- switch (type) {
4677
- case LLM_ROPE: rope_type = 0; break;
4678
- case LLM_ROPE_NEOX: rope_type = 2; break;
4679
- case LLM_ROPE_GLM: rope_type = 4; break;
4680
- }
4681
-
4682
- for (int il = 0; il < n_layer; ++il) {
4683
- struct ggml_tensor * tmp =
4684
- // we rotate only the first n_rot dimensions
4685
- ggml_rope_custom_inplace(ctx,
4686
- ggml_view_3d(ctx, kv.k_l[il],
4687
- n_embd_head_k, n_head_kv, n_ctx,
4688
- ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
4689
- ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
4690
- 0),
4691
- K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
4692
- ext_factor, attn_factor, beta_fast, beta_slow);
4693
- cb(tmp, "K_shifted", il);
4694
- ggml_build_forward_expand(graph, tmp);
4695
- }
4696
- }
4697
-
4698
4703
  static void llm_build_kv_store(
4699
4704
  struct ggml_context * ctx,
4700
4705
  const llama_hparams & hparams,
@@ -4896,8 +4901,8 @@ static struct ggml_tensor * llm_build_kqv(
4896
4901
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4897
4902
  }
4898
4903
 
4899
- #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
4900
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
4904
+ #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
4905
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
4901
4906
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
4902
4907
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
4903
4908
  if (hparams.f_max_alibi_bias > 0.0f) {
@@ -4998,6 +5003,7 @@ struct llm_build_context {
4998
5003
 
4999
5004
  const int64_t n_embd;
5000
5005
  const int64_t n_layer;
5006
+ const int64_t n_rot;
5001
5007
  const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
5002
5008
  const int64_t n_head;
5003
5009
  const int64_t n_head_kv;
@@ -5022,8 +5028,8 @@ struct llm_build_context {
5022
5028
  const int32_t kv_head; // index of where we store new KV data in the cache
5023
5029
  const int32_t n_orig_ctx;
5024
5030
 
5025
- const bool do_rope_shift;
5026
- const uint32_t pooling_type;
5031
+ const enum llama_pooling_type pooling_type;
5032
+ const enum llama_rope_type rope_type;
5027
5033
 
5028
5034
  const llm_build_cb & cb;
5029
5035
 
@@ -5045,6 +5051,7 @@ struct llm_build_context {
5045
5051
  kv_self (lctx.kv_self),
5046
5052
  n_embd (hparams.n_embd),
5047
5053
  n_layer (hparams.n_layer),
5054
+ n_rot (hparams.n_rot),
5048
5055
  n_ctx (cparams.n_ctx),
5049
5056
  n_head (hparams.n_head),
5050
5057
  n_head_kv (hparams.n_head_kv),
@@ -5066,8 +5073,8 @@ struct llm_build_context {
5066
5073
  n_kv (worst_case ? n_ctx : kv_self.n),
5067
5074
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
5068
5075
  n_orig_ctx (cparams.n_yarn_orig_ctx),
5069
- do_rope_shift (worst_case || kv_self.has_shift),
5070
- pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
5076
+ pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
5077
+ rope_type (hparams.rope_type),
5071
5078
  cb (cb),
5072
5079
  buf_compute_meta (lctx.buf_compute_meta) {
5073
5080
  // all initializations should be done in init()
@@ -5090,6 +5097,76 @@ struct llm_build_context {
5090
5097
  }
5091
5098
  }
5092
5099
 
5100
+ struct ggml_cgraph * build_k_shift() {
5101
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5102
+
5103
+ for (int il = 0; il < n_layer; ++il) {
5104
+ struct ggml_tensor * tmp =
5105
+ // we rotate only the first n_rot dimensions
5106
+ ggml_rope_custom_inplace(ctx0,
5107
+ ggml_view_3d(ctx0, kv_self.k_l[il],
5108
+ n_embd_head_k, n_head_kv, n_ctx,
5109
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
5110
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
5111
+ 0),
5112
+ lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5113
+ ext_factor, attn_factor, beta_fast, beta_slow);
5114
+ cb(tmp, "K_shifted", il);
5115
+ ggml_build_forward_expand(gf, tmp);
5116
+ }
5117
+
5118
+ return gf;
5119
+ }
5120
+
5121
+ struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
5122
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5123
+
5124
+ for (uint32_t i = 0; i < ids.size(); ++i) {
5125
+ const uint32_t id = ids[i];
5126
+
5127
+ if (i == id || id == ids.size()) {
5128
+ continue;
5129
+ }
5130
+
5131
+ uint32_t nm = 1;
5132
+
5133
+ while (i + nm < ids.size() && ids[i + nm] == id + nm) {
5134
+ nm++;
5135
+ }
5136
+
5137
+ for (int il = 0; il < n_layer; ++il) {
5138
+ ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
5139
+ n_embd_k_gqa, nm,
5140
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
5141
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
5142
+
5143
+ ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
5144
+ n_embd_k_gqa, nm,
5145
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
5146
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
5147
+
5148
+ ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
5149
+ nm, n_embd_v_gqa,
5150
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
5151
+ ggml_row_size(kv_self.v_l[il]->type, i));
5152
+
5153
+ ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
5154
+ nm, n_embd_v_gqa,
5155
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
5156
+ ggml_row_size(kv_self.v_l[il]->type, id));
5157
+
5158
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
5159
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
5160
+ }
5161
+
5162
+ i += nm - 1;
5163
+ }
5164
+
5165
+ //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
5166
+
5167
+ return gf;
5168
+ }
5169
+
5093
5170
  struct ggml_cgraph * build_llama() {
5094
5171
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5095
5172
 
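
build_k_shift() and build_defrag() above turn the K-cache shift and the new defragmentation pass into standalone graphs instead of work done inside every per-architecture builder. The defrag graph batches contiguous moves: for source cell i with target ids[i], it extends the run while the following sources map to the following targets, then copies the whole K/V block at once (cells whose target equals their own index or equals ids.size() are skipped). A self-contained illustration of that batching logic, using the same indexing as the loop above but only printing the planned moves:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Print the block moves implied by a defrag target map: ids[i] is the
    // destination cell for source cell i.
    static void print_moves(const std::vector<uint32_t> & ids) {
        for (uint32_t i = 0; i < ids.size(); ++i) {
            const uint32_t id = ids[i];
            if (i == id || id == ids.size()) {
                continue; // cell stays put or has nothing to move
            }
            uint32_t nm = 1;
            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
                nm++; // extend the contiguous run
            }
            std::printf("move cells [%u, %u) -> [%u, %u)\n", i, i + nm, id, id + nm);
            i += nm - 1;
        }
    }
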
@@ -5111,11 +5188,6 @@ struct llm_build_context {
5111
5188
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5112
5189
  cb(KQ_mask, "KQ_mask", -1);
5113
5190
 
5114
- // shift the entire K-cache if needed
5115
- if (do_rope_shift) {
5116
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5117
- }
5118
-
5119
5191
  for (int il = 0; il < n_layer; ++il) {
5120
5192
  struct ggml_tensor * inpSA = inpL;
5121
5193
 
@@ -5151,14 +5223,14 @@ struct llm_build_context {
5151
5223
 
5152
5224
  Qcur = ggml_rope_custom(
5153
5225
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5154
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5226
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5155
5227
  ext_factor, attn_factor, beta_fast, beta_slow
5156
5228
  );
5157
5229
  cb(Qcur, "Qcur", il);
5158
5230
 
5159
5231
  Kcur = ggml_rope_custom(
5160
5232
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5161
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5233
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5162
5234
  ext_factor, attn_factor, beta_fast, beta_slow
5163
5235
  );
5164
5236
  cb(Kcur, "Kcur", il);
@@ -5299,11 +5371,6 @@ struct llm_build_context {
5299
5371
  struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5300
5372
  cb(KQ_pos, "KQ_pos", -1);
5301
5373
 
5302
- // shift the entire K-cache if needed
5303
- if (do_rope_shift) {
5304
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5305
- }
5306
-
5307
5374
  for (int il = 0; il < n_layer; ++il) {
5308
5375
  struct ggml_tensor * inpSA = inpL;
5309
5376
 
@@ -5327,12 +5394,12 @@ struct llm_build_context {
5327
5394
  case MODEL_7B:
5328
5395
  Qcur = ggml_rope_custom(
5329
5396
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5330
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5397
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5331
5398
  ext_factor, attn_factor, beta_fast, beta_slow
5332
5399
  );
5333
5400
  Kcur = ggml_rope_custom(
5334
5401
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5335
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5402
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5336
5403
  ext_factor, attn_factor, beta_fast, beta_slow
5337
5404
  );
5338
5405
  break;
@@ -5417,11 +5484,6 @@ struct llm_build_context {
5417
5484
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5418
5485
  cb(KQ_mask, "KQ_mask", -1);
5419
5486
 
5420
- // shift the entire K-cache if needed
5421
- if (do_rope_shift) {
5422
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5423
- }
5424
-
5425
5487
  for (int il = 0; il < n_layer; ++il) {
5426
5488
  struct ggml_tensor * attn_norm;
5427
5489
 
@@ -5460,13 +5522,13 @@ struct llm_build_context {
5460
5522
 
5461
5523
  // using mode = 2 for neox mode
5462
5524
  Qcur = ggml_rope_custom(
5463
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5525
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5464
5526
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5465
5527
  );
5466
5528
  cb(Qcur, "Qcur", il);
5467
5529
 
5468
5530
  Kcur = ggml_rope_custom(
5469
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5531
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5470
5532
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5471
5533
  );
5472
5534
  cb(Kcur, "Kcur", il);
@@ -5636,10 +5698,6 @@ struct llm_build_context {
5636
5698
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5637
5699
  cb(KQ_mask, "KQ_mask", -1);
5638
5700
 
5639
- if (do_rope_shift) {
5640
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5641
- }
5642
-
5643
5701
  for (int il = 0; il < n_layer; ++il) {
5644
5702
  struct ggml_tensor * residual = inpL;
5645
5703
 
@@ -5697,7 +5755,7 @@ struct llm_build_context {
5697
5755
 
5698
5756
  // RoPE the first n_rot of q/k, pass the other half, and concat.
5699
5757
  struct ggml_tensor * qrot = ggml_view_3d(
5700
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5758
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5701
5759
  ggml_element_size(tmpq) * n_embd_head,
5702
5760
  ggml_element_size(tmpq) * n_embd_head * n_head,
5703
5761
  0
@@ -5705,7 +5763,7 @@ struct llm_build_context {
5705
5763
  cb(qrot, "qrot", il);
5706
5764
 
5707
5765
  struct ggml_tensor * krot = ggml_view_3d(
5708
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5766
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5709
5767
  ggml_element_size(tmpk) * n_embd_head,
5710
5768
  ggml_element_size(tmpk) * n_embd_head * n_head,
5711
5769
  0
@@ -5714,29 +5772,29 @@ struct llm_build_context {
5714
5772
 
5715
5773
  // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
5716
5774
  struct ggml_tensor * qpass = ggml_view_3d(
5717
- ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5775
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5718
5776
  ggml_element_size(tmpq) * n_embd_head,
5719
5777
  ggml_element_size(tmpq) * n_embd_head * n_head,
5720
- ggml_element_size(tmpq) * hparams.n_rot
5778
+ ggml_element_size(tmpq) * n_rot
5721
5779
  );
5722
5780
  cb(qpass, "qpass", il);
5723
5781
 
5724
5782
  struct ggml_tensor * kpass = ggml_view_3d(
5725
- ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5783
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5726
5784
  ggml_element_size(tmpk) * n_embd_head,
5727
5785
  ggml_element_size(tmpk) * n_embd_head * n_head,
5728
- ggml_element_size(tmpk) * hparams.n_rot
5786
+ ggml_element_size(tmpk) * n_rot
5729
5787
  );
5730
5788
  cb(kpass, "kpass", il);
5731
5789
 
5732
5790
  struct ggml_tensor * qrotated = ggml_rope_custom(
5733
- ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5791
+ ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5734
5792
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5735
5793
  );
5736
5794
  cb(qrotated, "qrotated", il);
5737
5795
 
5738
5796
  struct ggml_tensor * krotated = ggml_rope_custom(
5739
- ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5797
+ ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
5740
5798
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5741
5799
  );
5742
5800
  cb(krotated, "krotated", il);
@@ -5988,14 +6046,14 @@ struct llm_build_context {
5988
6046
 
5989
6047
  Qcur = ggml_rope_custom(
5990
6048
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5991
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6049
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5992
6050
  ext_factor, attn_factor, beta_fast, beta_slow
5993
6051
  );
5994
6052
  cb(Qcur, "Qcur", il);
5995
6053
 
5996
6054
  Kcur = ggml_rope_custom(
5997
6055
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5998
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6056
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
5999
6057
  ext_factor, attn_factor, beta_fast, beta_slow
6000
6058
  );
6001
6059
  cb(Kcur, "Kcur", il);
@@ -6047,12 +6105,12 @@ struct llm_build_context {
6047
6105
  cur = inpL;
6048
6106
 
6049
6107
  // pooling layer
6050
- if (pooling_type == LLAMA_POOLING_MEAN) {
6108
+ if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
6051
6109
  cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6052
- } else if (pooling_type == LLAMA_POOLING_CLS) {
6110
+ } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
6053
6111
  cur = ggml_get_rows(ctx0, cur, inp_cls);
6054
6112
  } else {
6055
- GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
6113
+ GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
6056
6114
  }
6057
6115
  cb(cur, "result_embd", -1);
6058
6116
 
@@ -6284,11 +6342,6 @@ struct llm_build_context {
6284
6342
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6285
6343
  cb(KQ_mask, "KQ_mask", -1);
6286
6344
 
6287
- // shift the entire K-cache if needed
6288
- if (do_rope_shift) {
6289
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6290
- }
6291
-
6292
6345
  for (int il = 0; il < n_layer; ++il) {
6293
6346
  struct ggml_tensor * inpSA = inpL;
6294
6347
 
@@ -6325,14 +6378,14 @@ struct llm_build_context {
6325
6378
 
6326
6379
  Qcur = ggml_rope_custom(
6327
6380
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6328
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6381
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6329
6382
  ext_factor, attn_factor, beta_fast, beta_slow
6330
6383
  );
6331
6384
  cb(Qcur, "Qcur", il);
6332
6385
 
6333
6386
  Kcur = ggml_rope_custom(
6334
6387
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6335
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6388
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6336
6389
  ext_factor, attn_factor, beta_fast, beta_slow
6337
6390
  );
6338
6391
  cb(Kcur, "Kcur", il);
@@ -6407,11 +6460,6 @@ struct llm_build_context {
6407
6460
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6408
6461
  cb(KQ_mask, "KQ_mask", -1);
6409
6462
 
6410
- // shift the entire K-cache if needed
6411
- if (do_rope_shift) {
6412
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6413
- }
6414
-
6415
6463
  for (int il = 0; il < n_layer; ++il) {
6416
6464
  struct ggml_tensor * inpSA = inpL;
6417
6465
 
@@ -6441,13 +6489,13 @@ struct llm_build_context {
6441
6489
 
6442
6490
  // using mode = 2 for neox mode
6443
6491
  Qcur = ggml_rope_custom(
6444
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6492
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6445
6493
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6446
6494
  );
6447
6495
  cb(Qcur, "Qcur", il);
6448
6496
 
6449
6497
  Kcur = ggml_rope_custom(
6450
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6498
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6451
6499
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6452
6500
  );
6453
6501
  cb(Kcur, "Kcur", il);
@@ -6521,11 +6569,6 @@ struct llm_build_context {
6521
6569
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6522
6570
  cb(KQ_mask, "KQ_mask", -1);
6523
6571
 
6524
- // shift the entire K-cache if needed
6525
- if (do_rope_shift) {
6526
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6527
- }
6528
-
6529
6572
  for (int il = 0; il < n_layer; ++il) {
6530
6573
  struct ggml_tensor * inpSA = inpL;
6531
6574
 
@@ -6561,14 +6604,14 @@ struct llm_build_context {
6561
6604
 
6562
6605
  Qcur = ggml_rope_custom(
6563
6606
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6564
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6607
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6565
6608
  ext_factor, attn_factor, beta_fast, beta_slow
6566
6609
  );
6567
6610
  cb(Qcur, "Qcur", il);
6568
6611
 
6569
6612
  Kcur = ggml_rope_custom(
6570
6613
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6571
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6614
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6572
6615
  ext_factor, attn_factor, beta_fast, beta_slow
6573
6616
  );
6574
6617
  cb(Kcur, "Kcur", il);
@@ -6642,11 +6685,6 @@ struct llm_build_context {
6642
6685
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6643
6686
  cb(KQ_mask, "KQ_mask", -1);
6644
6687
 
6645
- // shift the entire K-cache if needed
6646
- if (do_rope_shift) {
6647
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6648
- }
6649
-
6650
6688
  for (int il = 0; il < n_layer; ++il) {
6651
6689
  attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
6652
6690
  model.layers[il].attn_norm,
@@ -6684,7 +6722,7 @@ struct llm_build_context {
6684
6722
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6685
6723
 
6686
6724
  Qcur = ggml_rope_custom(
6687
- ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6725
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6688
6726
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6689
6727
  );
6690
6728
  cb(Qcur, "Qcur", il);
@@ -6695,7 +6733,7 @@ struct llm_build_context {
6695
6733
  cb(Qcur, "Qcur", il);
6696
6734
 
6697
6735
  Kcur = ggml_rope_custom(
6698
- ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
6736
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
6699
6737
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
6700
6738
  );
6701
6739
  cb(Kcur, "Kcur", il);
@@ -6764,11 +6802,6 @@ struct llm_build_context {
6764
6802
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6765
6803
  cb(KQ_mask, "KQ_mask", -1);
6766
6804
 
6767
- // shift the entire K-cache if needed
6768
- if (do_rope_shift) {
6769
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6770
- }
6771
-
6772
6805
  for (int il = 0; il < n_layer; ++il) {
6773
6806
 
6774
6807
  // norm
@@ -6792,14 +6825,14 @@ struct llm_build_context {
6792
6825
  cb(Vcur, "Vcur", il);
6793
6826
 
6794
6827
  Qcur = ggml_rope_custom(
6795
- ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
6796
- n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
6828
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
6829
+ n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6797
6830
  ext_factor, attn_factor, beta_fast, beta_slow);
6798
6831
  cb(Qcur, "Qcur", il);
6799
6832
 
6800
6833
  Kcur = ggml_rope_custom(
6801
- ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
6802
- n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
6834
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
6835
+ n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6803
6836
  ext_factor, attn_factor, beta_fast, beta_slow);
6804
6837
  cb(Kcur, "Kcur", il);
6805
6838
 
@@ -6969,11 +7002,6 @@ struct llm_build_context {
6969
7002
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6970
7003
  cb(KQ_mask, "KQ_mask", -1);
6971
7004
 
6972
- // shift the entire K-cache if needed
6973
- if (do_rope_shift) {
6974
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6975
- }
6976
-
6977
7005
  for (int il = 0; il < n_layer; ++il) {
6978
7006
  cur = llm_build_norm(ctx0, inpL, hparams,
6979
7007
  model.layers[il].attn_norm,
@@ -6999,14 +7027,14 @@ struct llm_build_context {
6999
7027
 
7000
7028
  struct ggml_tensor * Qcur = ggml_rope_custom(
7001
7029
  ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
7002
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7030
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7003
7031
  ext_factor, attn_factor, beta_fast, beta_slow
7004
7032
  );
7005
7033
  cb(Qcur, "Qcur", il);
7006
7034
 
7007
7035
  struct ggml_tensor * Kcur = ggml_rope_custom(
7008
7036
  ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
7009
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7037
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7010
7038
  ext_factor, attn_factor, beta_fast, beta_slow
7011
7039
  );
7012
7040
  cb(Kcur, "Kcur", il);
@@ -7077,11 +7105,6 @@ struct llm_build_context {
7077
7105
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7078
7106
  cb(KQ_mask, "KQ_mask", -1);
7079
7107
 
7080
- // shift the entire K-cache if needed
7081
- if (do_rope_shift) {
7082
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7083
- }
7084
-
7085
7108
  for (int il = 0; il < n_layer; ++il) {
7086
7109
  struct ggml_tensor * inpSA = inpL;
7087
7110
 
@@ -7117,14 +7140,14 @@ struct llm_build_context {
7117
7140
 
7118
7141
  Qcur = ggml_rope_custom(
7119
7142
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7120
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7143
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7121
7144
  ext_factor, attn_factor, beta_fast, beta_slow
7122
7145
  );
7123
7146
  cb(Qcur, "Qcur", il);
7124
7147
 
7125
7148
  Kcur = ggml_rope_custom(
7126
7149
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7127
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
7150
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7128
7151
  ext_factor, attn_factor, beta_fast, beta_slow
7129
7152
  );
7130
7153
  cb(Kcur, "Kcur", il);
@@ -7196,11 +7219,6 @@ struct llm_build_context {
7196
7219
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7197
7220
  cb(KQ_mask, "KQ_mask", -1);
7198
7221
 
7199
- // shift the entire K-cache if needed
7200
- if (do_rope_shift) {
7201
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7202
- }
7203
-
7204
7222
  for (int il = 0; il < n_layer; ++il) {
7205
7223
  struct ggml_tensor * inpSA = inpL;
7206
7224
 
@@ -7236,14 +7254,14 @@ struct llm_build_context {
7236
7254
 
7237
7255
  Qcur = ggml_rope_custom(
7238
7256
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7239
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7257
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7240
7258
  ext_factor, attn_factor, beta_fast, beta_slow
7241
7259
  );
7242
7260
  cb(Qcur, "Qcur", il);
7243
7261
 
7244
7262
  Kcur = ggml_rope_custom(
7245
7263
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7246
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7264
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7247
7265
  ext_factor, attn_factor, beta_fast, beta_slow
7248
7266
  );
7249
7267
  cb(Kcur, "Kcur", il);
@@ -7328,11 +7346,6 @@ struct llm_build_context {
7328
7346
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7329
7347
  cb(KQ_mask, "KQ_mask", -1);
7330
7348
 
7331
- // shift the entire K-cache if needed
7332
- if (do_rope_shift) {
7333
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7334
- }
7335
-
7336
7349
  for (int il = 0; il < n_layer; ++il) {
7337
7350
  struct ggml_tensor * inpSA = inpL;
7338
7351
 
@@ -7368,14 +7381,14 @@ struct llm_build_context {
7368
7381
 
7369
7382
  Qcur = ggml_rope_custom(
7370
7383
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7371
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7384
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7372
7385
  ext_factor, attn_factor, beta_fast, beta_slow
7373
7386
  );
7374
7387
  cb(Qcur, "Qcur", il);
7375
7388
 
7376
7389
  Kcur = ggml_rope_custom(
7377
7390
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7378
- hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7391
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7379
7392
  ext_factor, attn_factor, beta_fast, beta_slow
7380
7393
  );
7381
7394
  cb(Kcur, "Kcur", il);
@@ -7464,11 +7477,6 @@ struct llm_build_context {
7464
7477
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7465
7478
  cb(KQ_mask, "KQ_mask", -1);
7466
7479
 
7467
- // shift the entire K-cache if needed
7468
- if (do_rope_shift) {
7469
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
7470
- }
7471
-
7472
7480
  for (int il = 0; il < n_layer; ++il) {
7473
7481
 
7474
7482
  // norm
@@ -7491,7 +7499,7 @@ struct llm_build_context {
7491
7499
 
7492
7500
  Qcur = ggml_rope_custom(
7493
7501
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
7494
- n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
7502
+ n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7495
7503
  ext_factor, attn_factor, beta_fast, beta_slow);
7496
7504
  cb(Qcur, "Qcur", il);
7497
7505
 
@@ -7500,7 +7508,7 @@ struct llm_build_context {
7500
7508
 
7501
7509
  Kcur = ggml_rope_custom(
7502
7510
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
7503
- n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
7511
+ n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7504
7512
  ext_factor, attn_factor, beta_fast, beta_slow);
7505
7513
  cb(Kcur, "Kcur", il);
7506
7514
 
@@ -7553,6 +7561,40 @@ struct llm_build_context {
7553
7561
  }
7554
7562
  };
7555
7563
 
7564
+ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
7565
+ llama_batch dummy;
7566
+ dummy.n_tokens = 0;
7567
+
7568
+ llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
7569
+
7570
+ struct llm_build_context llm(lctx, dummy, cb, false);
7571
+
7572
+ llm.init();
7573
+
7574
+ struct ggml_cgraph * result = llm.build_defrag(ids);
7575
+
7576
+ llm.free();
7577
+
7578
+ return result;
7579
+ }
7580
+
7581
+ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
7582
+ llama_batch dummy;
7583
+ dummy.n_tokens = 0;
7584
+
7585
+ llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
7586
+
7587
+ struct llm_build_context llm(lctx, dummy, cb, false);
7588
+
7589
+ llm.init();
7590
+
7591
+ struct ggml_cgraph * result = llm.build_k_shift();
7592
+
7593
+ llm.free();
7594
+
7595
+ return result;
7596
+ }
7597
+
7556
7598
  static struct ggml_cgraph * llama_build_graph(
7557
7599
  llama_context & lctx,
7558
7600
  const llama_batch & batch,
@@ -7672,6 +7714,20 @@ static struct ggml_cgraph * llama_build_graph(
7672
7714
  return result;
7673
7715
  }
7674
7716
 
7717
+ static void llama_set_k_shift(llama_context & lctx) {
7718
+ const auto & cparams = lctx.cparams;
7719
+
7720
+ const int64_t n_ctx = cparams.n_ctx;
7721
+
7722
+ assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7723
+
7724
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7725
+
7726
+ for (int i = 0; i < n_ctx; ++i) {
7727
+ data[i] = lctx.kv_self.cells[i].delta;
7728
+ }
7729
+ }
7730
+
7675
7731
  static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7676
7732
  //
7677
7733
  // set input data
@@ -7739,19 +7795,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7739
7795
  }
7740
7796
  }
7741
7797
 
7742
- if (kv_self.has_shift) {
7743
- const int64_t n_ctx = cparams.n_ctx;
7744
-
7745
- assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7746
-
7747
- int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7748
-
7749
- for (int i = 0; i < n_ctx; ++i) {
7750
- data[i] = lctx.kv_self.cells[i].delta;
7751
- }
7752
- }
7753
-
7754
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
7798
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
7755
7799
  const int64_t n_tokens = batch.n_tokens;
7756
7800
 
7757
7801
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
@@ -7779,7 +7823,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7779
7823
  }
7780
7824
  }
7781
7825
 
7782
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
7826
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
7783
7827
  const int64_t n_tokens = batch.n_tokens;
7784
7828
 
7785
7829
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
@@ -7795,6 +7839,34 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7795
7839
  }
7796
7840
  }
7797
7841
 
7842
+ static void llama_graph_compute(
7843
+ llama_context & lctx,
7844
+ ggml_cgraph * gf,
7845
+ int n_threads) {
7846
+ #ifdef GGML_USE_MPI
7847
+ const int64_t n_layer = lctx.model.hparams.n_layer;
7848
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
7849
+ #endif
7850
+
7851
+ #ifdef GGML_USE_METAL
7852
+ if (ggml_backend_is_metal(lctx.backend_metal)) {
7853
+ ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
7854
+ }
7855
+ #endif
7856
+
7857
+ if (lctx.backend_cpu != nullptr) {
7858
+ ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
7859
+ }
7860
+
7861
+ ggml_backend_sched_graph_compute(lctx.sched, gf);
7862
+
7863
+ // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
7864
+
7865
+ #ifdef GGML_USE_MPI
7866
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
7867
+ #endif
7868
+ }
7869
+
7798
7870
  // decode a batch of tokens by evaluating the transformer
7799
7871
  //
7800
7872
  // - lctx: llama context
@@ -7821,9 +7893,9 @@ static int llama_decode_internal(
7821
7893
  const auto n_batch = cparams.n_batch;
7822
7894
 
7823
7895
  GGML_ASSERT(n_tokens <= n_batch);
7896
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
7824
7897
 
7825
7898
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
7826
- GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
7827
7899
 
7828
7900
  const int64_t t_start_us = ggml_time_us();
7829
7901
 
@@ -7872,6 +7944,8 @@ static int llama_decode_internal(
7872
7944
  batch.seq_id = seq_id_arr.data();
7873
7945
  }
7874
7946
 
7947
+ llama_kv_cache_update(&lctx);
7948
+
7875
7949
  // if we have enough unused cells before the current head ->
7876
7950
  // better to start searching from the beginning of the cache, hoping to fill it
7877
7951
  if (kv_self.head > kv_self.used + 2*n_tokens) {
@@ -7896,8 +7970,9 @@ static int llama_decode_internal(
7896
7970
  ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
7897
7971
 
7898
7972
  // the output is always the last tensor in the graph
7899
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7973
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7900
7974
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7975
+
7901
7976
  if (strcmp(res->name, "result_output") == 0) {
7902
7977
  // the embeddings could be the second to last tensor, or the third to last tensor
7903
7978
  if (strcmp(embeddings->name, "result_norm") != 0) {
@@ -7924,40 +7999,12 @@ static int llama_decode_internal(
7924
7999
  n_threads = std::min(4, n_threads);
7925
8000
  }
7926
8001
 
7927
- #ifdef GGML_USE_MPI
7928
- const int64_t n_layer = hparams.n_layer;
7929
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
7930
- #endif
7931
-
7932
- #ifdef GGML_USE_METAL
7933
- if (ggml_backend_is_metal(lctx.backend_metal)) {
7934
- ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
7935
- }
7936
- #endif
7937
-
7938
- if (lctx.backend_cpu != nullptr) {
7939
- ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
7940
- }
7941
-
7942
8002
  llama_set_inputs(lctx, batch);
7943
8003
 
7944
- ggml_backend_sched_graph_compute(lctx.sched, gf);
7945
-
7946
- // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
7947
-
7948
- #ifdef GGML_USE_MPI
7949
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
7950
- #endif
8004
+ llama_graph_compute(lctx, gf, n_threads);
7951
8005
 
7952
8006
  // update the kv ring buffer
7953
8007
  {
7954
- if (kv_self.has_shift) {
7955
- kv_self.has_shift = false;
7956
- for (uint32_t i = 0; i < kv_self.size; ++i) {
7957
- kv_self.cells[i].delta = 0;
7958
- }
7959
- }
7960
-
7961
8008
  kv_self.head += n_tokens;
7962
8009
 
7963
8010
  // Ensure kv cache head points to a valid index.
@@ -7966,6 +8013,18 @@ static int llama_decode_internal(
7966
8013
  }
7967
8014
  }
7968
8015
 
8016
+ // decide if we need to defrag the kv cache
8017
+ if (cparams.defrag_thold >= 0.0f) {
8018
+ const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
8019
+
8020
+ // queue defragmentation for next llama_kv_cache_update
8021
+ if (fragmentation > cparams.defrag_thold) {
8022
+ //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
8023
+
8024
+ llama_kv_cache_defrag(kv_self);
8025
+ }
8026
+ }
8027
+
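The new defrag_thold check above measures fragmentation as 1 - (used + n_tokens)/n over the active part of the cache, and only once at least 128 cells are in play: with n = 512 and used + n_tokens = 448, the ratio is 1 - 448/512 = 0.125, so a threshold of 0.1 would queue a defrag for the next llama_kv_cache_update(). A hedged configuration sketch (the default of -1.0f keeps automatic defragmentation disabled; model is assumed to be already loaded):

    struct llama_context_params cparams = llama_context_default_params();
    cparams.defrag_thold = 0.1f;   // defrag once more than 10% of the occupied span is holes

    struct llama_context * ctx = llama_new_context_with_model(model, cparams);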
7969
8028
  #ifdef GGML_PERF
7970
8029
  // print timing information per ggml operation (for debugging purposes)
7971
8030
  // requires GGML_PERF to be defined
@@ -8053,6 +8112,245 @@ static int llama_decode_internal(
8053
8112
  return 0;
8054
8113
  }
8055
8114
 
8115
+ // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
8116
+ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8117
+ auto & kv_self = lctx.kv_self;
8118
+
8119
+ const auto & hparams = lctx.model.hparams;
8120
+
8121
+ const uint32_t n_layer = hparams.n_layer;
8122
+
8123
+ const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
8124
+ const uint32_t n_used = kv_self.used;
8125
+
8126
+ assert(n_used <= n_kv);
8127
+
8128
+ //const int64_t t_start = ggml_time_us();
8129
+
8130
+ // number of cells moved
8131
+ uint32_t n_moves = 0;
8132
+
8133
+ // determine which KV cells to move where
8134
+ //
8135
+ // cell i moves to ids[i]
8136
+ //
8137
+ // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
8138
+ //
8139
+ std::vector<uint32_t> ids(n_kv, n_kv);
8140
+
8141
+ for (uint32_t i0 = 0; i0 < n_used; ++i0) {
8142
+ const auto & cell0 = kv_self.cells[i0];
8143
+
8144
+ if (!cell0.is_empty()) {
8145
+ ids[i0] = i0;
8146
+
8147
+ continue;
8148
+ }
8149
+
8150
+ // found a hole - fill it with data from the end of the cache
8151
+
8152
+ uint32_t nh = 1;
8153
+
8154
+ // determine the size of the hole
8155
+ while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
8156
+ nh++;
8157
+ }
8158
+
8159
+ // each move requires 6*n_layer tensors (see build_defrag)
8160
+ // - source view, destination view, copy operation
8161
+ // - x2 for keys and values
8162
+ //
8163
+ if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
8164
+ // the graph is too big, we cannot move more cells
8165
+ break;
8166
+ }
8167
+
8168
+ uint32_t nf = 0;
8169
+ uint32_t is = n_kv - 1;
8170
+
8171
+ // starting from the end, find nh non-empty cells
8172
+ for (; is > i0; --is) {
8173
+ const auto & cell1 = kv_self.cells[is];
8174
+
8175
+ if (cell1.is_empty() || ids[is] != n_kv) {
8176
+ continue;
8177
+ }
8178
+
8179
+ // non-empty cell which is not yet moved
8180
+ nf++;
8181
+
8182
+ if (nf == nh) {
8183
+ break;
8184
+ }
8185
+ }
8186
+
8187
+ // this can only happen if `n_used` is not accurate, which would be a bug
8188
+ GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
8189
+
8190
+ nf = 0;
8191
+
8192
+ uint32_t i1 = is;
8193
+
8194
+ // are we moving a continuous block of memory?
8195
+ bool cont = false;
8196
+
8197
+ // go back and move the nf cells to the hole
8198
+ for (; i1 < n_kv; ++i1) {
8199
+ auto & cell1 = kv_self.cells[i1];
8200
+
8201
+ if (cell1.is_empty() || ids[i1] != n_kv) {
8202
+ cont = false;
8203
+ continue;
8204
+ }
8205
+
8206
+ // this cell goes to (i0 + nf)
8207
+ ids[i1] = i0 + nf;
8208
+
8209
+ // move the cell meta data
8210
+ kv_self.cells[i0 + nf] = cell1;
8211
+
8212
+ // clear the old cell and move the head there
8213
+ cell1 = llama_kv_cell();
8214
+ kv_self.head = n_used;
8215
+
8216
+ if (!cont) {
8217
+ n_moves++;
8218
+ cont = true;
8219
+ }
8220
+
8221
+ nf++;
8222
+
8223
+ if (nf == nh) {
8224
+ break;
8225
+ }
8226
+ }
8227
+
8228
+ //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
8229
+
8230
+ i0 += nh - 1;
8231
+ }
8232
+
8233
+ if (n_moves == 0) {
8234
+ return;
8235
+ }
8236
+
8237
+ //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
8238
+
8239
+ //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
8240
+
8241
+ #if 0
8242
+ // CPU defrag
8243
+ //
8244
+ // TODO: optimizations are possible:
8245
+ // - multiple threads
8246
+ // - avoid copying to the host memory when already there
8247
+ //
8248
+ // likely not worth the effort, as we have ggml_graph based defrag
8249
+ //
8250
+
8251
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
8252
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
8253
+
8254
+ const uint32_t kv_size = kv_self.size;
8255
+
8256
+ std::vector<uint8_t> buf_k;
8257
+ std::vector<uint8_t> buf_v;
8258
+
8259
+ for (uint32_t il = 0; il < n_layer; ++il) {
8260
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
8261
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
8262
+
8263
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
8264
+ const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
8265
+
8266
+ buf_k.resize(k_size);
8267
+ buf_v.resize(v_size);
8268
+
8269
+ ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
8270
+ ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
8271
+
8272
+ // batch move [i, i+nm) to [id, id+nm)
8273
+ // note: cells can move only to a lower index
8274
+ for (uint32_t i = 0; i < n_kv; ++i) {
8275
+ const uint32_t id = ids[i];
8276
+
8277
+ if (i == id || id == n_kv) {
8278
+ continue;
8279
+ }
8280
+
8281
+ uint32_t nm = 1;
8282
+
8283
+ while (i + nm < n_kv && ids[i + nm] == id + nm) {
8284
+ nm++;
8285
+ }
8286
+
8287
+ // move keys
8288
+ {
8289
+ const int64_t os = i*k_size_row;
8290
+ const int64_t od = id*k_size_row;
8291
+
8292
+ memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
8293
+ }
8294
+
8295
+ // move values (note: they are transposed)
8296
+ {
8297
+ const int64_t os = i;
8298
+ const int64_t od = id;
8299
+
8300
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
8301
+ memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
8302
+ }
8303
+ }
8304
+
8305
+ i += nm - 1;
8306
+ }
8307
+
8308
+ ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
8309
+ ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
8310
+ }
8311
+ #else
8312
+ // ggml_graph defrag
8313
+
8314
+ ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
8315
+
8316
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
8317
+ #endif
8318
+
8319
+ //const int64_t t_end = ggml_time_us();
8320
+
8321
+ //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
8322
+ }
8323
+
8324
+ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
8325
+ // apply K-shift if needed
8326
+ if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
8327
+ llama_set_k_shift(lctx);
8328
+
8329
+ {
8330
+ ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
8331
+
8332
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
8333
+ }
8334
+
8335
+ {
8336
+ auto & kv_self = lctx.kv_self;
8337
+
8338
+ kv_self.has_shift = false;
8339
+
8340
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
8341
+ kv_self.cells[i].delta = 0;
8342
+ }
8343
+ }
8344
+ }
8345
+
8346
+ // defragment the KV cache if needed
8347
+ if (lctx.kv_self.do_defrag) {
8348
+ llama_kv_cache_defrag_internal(lctx);
8349
+
8350
+ lctx.kv_self.do_defrag = false;
8351
+ }
8352
+ }
8353
+
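To make the ids[] mapping built by llama_kv_cache_defrag_internal() above concrete, here is a small self-contained toy in the same spirit (simplified: it ignores the graph-size limit, the sequence metadata and the exact fill order of the real code): holes are located from the front of the cache and trailing cells are mapped into them, so data only ever moves to a lower index.

    #include <cstdio>
    #include <vector>

    int main() {
        // occupied[i] says whether toy KV cell i holds data; the hole is at cells 1..2
        std::vector<bool> occupied = { true, false, false, true, true, true, true, true };
        const unsigned n_kv   = (unsigned) occupied.size();
        const unsigned n_used = 6;                     // number of occupied cells

        std::vector<unsigned> ids(n_kv, n_kv);         // ids[i] == n_kv means "cell i is not moved"
        for (unsigned i0 = 0; i0 < n_used; ++i0) {
            if (occupied[i0]) { ids[i0] = i0; continue; }

            unsigned nh = 1;                           // size of the hole starting at i0
            while (i0 + nh < n_used && !occupied[i0 + nh]) nh++;

            // pull the last nh occupied, not-yet-moved cells into the hole
            unsigned nf = 0;
            for (unsigned is = n_kv - 1; is > i0 && nf < nh; --is) {
                if (!occupied[is] || ids[is] != n_kv) continue;
                ids[is] = i0 + nf++;
            }
            i0 += nh - 1;                              // skip over the (now filled) hole
        }

        for (unsigned i = 0; i < n_kv; ++i) printf("ids[%u] = %u\n", i, ids[i]);
        // prints 0 8 8 3 4 5 2 1 : cells 7 and 6 are mapped into the hole at 1 and 2,
        // every other cell stays where it is
        return 0;
    }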
8056
8354
  //
8057
8355
  // tokenizer
8058
8356
  //
@@ -8644,37 +8942,46 @@ struct llm_tokenizer_wpm {
8644
8942
  }
8645
8943
 
8646
8944
  std::vector<std::string> preprocess(const std::string & text) {
8647
- std::string ori_str = normalize(text);
8648
- uint64_t ori_size = ori_str.size();
8945
+ // normalization form D

8946
+ std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
8947
+ std::vector<uint32_t> nfd_codepoints;
8948
+ for (uint32_t code : codepoints) {
8949
+ auto it = nfd_map.equal_range(code);
8950
+ if (it.first != it.second) {
8951
+ for (auto jt = it.first; jt != it.second; jt++) {
8952
+ nfd_codepoints.push_back(jt->second);
8953
+ }
8954
+ } else {
8955
+ nfd_codepoints.push_back(code);
8956
+ }
8957
+ }
8649
8958
 
8650
- // single punct / single symbol / single digit
8651
- // baseline: add whitespace on the left and right of punct and chinese characters
8652
- std::vector<std::string> words;
8959
+ // strip accents, strip control, uniformize whitespace,
8960
+ // to lowercase, pad chinese characters, pad punctuation
8653
8961
  std::string new_str = "";
8654
- uint64_t i = 0;
8655
- while (i < ori_size) {
8656
- int utf_char_len = utf8_len(ori_str[i]);
8657
- if ((utf_char_len == 1) && ispunct(ori_str[i])) {
8658
- new_str += " ";
8659
- new_str += ori_str[i];
8660
- new_str += " ";
8661
- i += 1;
8962
+ for (uint32_t code : nfd_codepoints) {
8963
+ int type = codepoint_type(code);
8964
+ if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
8965
+ continue;
8662
8966
  }
8663
- else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
8967
+ code = to_lower(code);
8968
+ if (type == CODEPOINT_TYPE_WHITESPACE) {
8969
+ code = ' ';
8970
+ }
8971
+ std::string s = codepoint_to_utf8(code);
8972
+ if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
8664
8973
  new_str += " ";
8665
- new_str += ori_str.substr(i, 3);
8974
+ new_str += s;
8666
8975
  new_str += " ";
8667
- i += 3;
8668
- }
8669
- else {
8670
- new_str += ori_str[i];
8671
- i += 1;
8976
+ } else {
8977
+ new_str += s;
8672
8978
  }
8673
8979
  }
8674
8980
 
8675
8981
  // split by whitespace
8676
8982
  uint64_t l = 0;
8677
8983
  uint64_t r = 0;
8984
+ std::vector<std::string> words;
8678
8985
  while (r < new_str.size()) {
8679
8986
  // if is whitespace
8680
8987
  if (isspace(new_str[r])) {
@@ -8692,47 +8999,21 @@ struct llm_tokenizer_wpm {
8692
8999
  return words;
8693
9000
  }
8694
9001
 
8695
- std::string normalize(const std::string & text) {
8696
- // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
8697
- std::string text2 = strip_accents(text);
8698
- for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
8699
- char c = text2[i];
8700
- if (c >= 'A' && c <= 'Z') {
8701
- text2[i] = c - 'A' + 'a';
8702
- }
9002
+ uint32_t to_lower(uint32_t code) {
9003
+ static const std::locale locale("en_US.UTF-8");
9004
+ #if defined(_WIN32)
9005
+ if (code > 0xFFFF) {
9006
+ return code;
8703
9007
  }
8704
- return text2;
9008
+ #endif
9009
+ return std::tolower(wchar_t(code), locale);
8705
9010
  }
8706
9011
 
8707
- bool is_chinese_char(const std::string & str) {
8708
- int len = str.length();
8709
- unsigned int codepoint = 0;
8710
- int num_bytes = 0;
8711
- int i = 0;
8712
- unsigned char ch = static_cast<unsigned char>(str[i]);
8713
- if (ch <= 0x7f) {
8714
- codepoint = ch;
8715
- num_bytes = 1;
8716
- } else if ((ch >> 5) == 0x06) {
8717
- codepoint = ch & 0x1f;
8718
- num_bytes = 2;
8719
- } else if ((ch >> 4) == 0x0e) {
8720
- codepoint = ch & 0x0f;
8721
- num_bytes = 3;
8722
- } else if ((ch >> 3) == 0x1e) {
8723
- codepoint = ch & 0x07;
8724
- num_bytes = 4;
8725
- }
8726
- for (int j = 1; j < num_bytes; ++j) {
8727
- if (i + j >= len) {
8728
- return false; // incomplete UTF-8 character
8729
- }
8730
- unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
8731
- if ((next_ch >> 6) != 0x02) {
8732
- return false; // invalid trailing byte
8733
- }
8734
- codepoint = (codepoint << 6) | (next_ch & 0x3f);
8735
- }
9012
+ bool is_ascii_punct(uint32_t code) {
9013
+ return code < 256 && ispunct(code);
9014
+ }
9015
+
9016
+ bool is_chinese_char(uint32_t codepoint) {
8736
9017
  if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
8737
9018
  (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
8738
9019
  (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
@@ -8748,41 +9029,6 @@ struct llm_tokenizer_wpm {
8748
9029
  return false;
8749
9030
  }
8750
9031
 
8751
- std::string strip_accents(const std::string & input_string) {
8752
- std::string resultString;
8753
- std::map<std::string, char> accent_map = {
8754
- {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
8755
- {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
8756
- {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
8757
- {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
8758
- {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
8759
- {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
8760
- {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
8761
- {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
8762
- {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
8763
- };
8764
-
8765
- for (size_t i = 0; i < input_string.length();) {
8766
- int len = utf8_len(input_string[i]);
8767
- std::string curChar = input_string.substr(i, len);
8768
- auto iter = accent_map.find(curChar);
8769
- if (iter != accent_map.end()) {
8770
- resultString += iter->second;
8771
- } else {
8772
- resultString += curChar;
8773
- }
8774
- i += len;
8775
- }
8776
-
8777
- return resultString;
8778
- }
8779
-
8780
- static size_t utf8_len(char src) {
8781
- const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
8782
- uint8_t highbits = static_cast<uint8_t>(src) >> 4;
8783
- return lookup[highbits];
8784
- }
8785
-
8786
9032
  const llama_vocab & vocab;
8787
9033
  };
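The rewritten preprocess() above drops the byte-oriented normalize()/strip_accents()/utf8_len() helpers and works on Unicode codepoints instead: the text is decomposed to NFD via nfd_map, accent marks and control characters are removed, whitespace is unified, everything is lowercased through the new <locale>/<cwctype> machinery, and CJK characters and punctuation are padded with spaces before the whitespace split. A minimal self-contained illustration of the NFD step (a hypothetical two-entry decomposition table; llama.cpp uses its full nfd_map):

    #include <cstdint>
    #include <cstdio>
    #include <cwctype>
    #include <map>
    #include <vector>

    int main() {
        // toy decomposition table: é (U+00E9) -> e (U+0065) + combining acute accent (U+0301)
        std::multimap<uint32_t, uint32_t> nfd = { { 0x00E9, 0x0065 }, { 0x00E9, 0x0301 } };

        std::vector<uint32_t> text = { 0x0043, 0x0061, 0x0066, 0x00E9 };  // "Café"
        std::vector<uint32_t> out;
        for (uint32_t cp : text) {
            auto range = nfd.equal_range(cp);
            if (range.first != range.second) {
                for (auto it = range.first; it != range.second; ++it) out.push_back(it->second);
            } else {
                out.push_back(cp);
            }
        }

        for (uint32_t cp : out) {
            if (cp == 0x0301) continue;                // drop the combining accent mark
            printf("U+%04X ", (unsigned) std::towlower((wint_t) cp));
        }
        printf("\n");                                  // U+0063 U+0061 U+0066 U+0065 -> "cafe"
        return 0;
    }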
8788
9034
 
@@ -9816,10 +10062,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
9816
10062
  }
9817
10063
  }
9818
10064
 
9819
- void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
9820
- llama_sample_temp(ctx, candidates_p, temp);
9821
- }
9822
-
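The deprecated llama_sample_temperature() alias is removed; callers switch to the llama_sample_temp() name it forwarded to, keeping their existing arguments:

    llama_sample_temp(ctx, &candidates, temp);   // was: llama_sample_temperature(ctx, &candidates, temp);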
9823
10065
  void llama_sample_repetition_penalties(
9824
10066
  struct llama_context * ctx,
9825
10067
  llama_token_data_array * candidates,
@@ -9946,38 +10188,6 @@ void llama_sample_apply_guidance(
9946
10188
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9947
10189
  }
9948
10190
 
9949
- void llama_sample_classifier_free_guidance(
9950
- struct llama_context * ctx,
9951
- llama_token_data_array * candidates,
9952
- struct llama_context * guidance_ctx,
9953
- float scale) {
9954
- GGML_ASSERT(ctx);
9955
- int64_t t_start_sample_us;
9956
-
9957
- t_start_sample_us = ggml_time_us();
9958
- const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
9959
-
9960
- GGML_ASSERT(n_vocab == candidates->size);
9961
- GGML_ASSERT(!candidates->sorted);
9962
-
9963
- std::vector<float> logits_base(n_vocab);
9964
- for (size_t i = 0; i < n_vocab; ++i) {
9965
- logits_base[i] = candidates->data[i].logit;
9966
- }
9967
-
9968
- float * logits_guidance = llama_get_logits(guidance_ctx);
9969
-
9970
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9971
- llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
9972
- t_start_sample_us = ggml_time_us();
9973
-
9974
- for (size_t i = 0; i < n_vocab; ++i) {
9975
- candidates->data[i].logit = logits_base[i];
9976
- }
9977
-
9978
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
9979
- }
9980
-
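llama_sample_classifier_free_guidance() is removed in favour of the lower-level llama_sample_apply_guidance() that it wrapped. A hedged migration sketch that mirrors the removed body (ctx, guidance_ctx, candidates and scale are assumed to be the caller's existing objects):

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));

    std::vector<float> logits_base(n_vocab);
    for (int i = 0; i < n_vocab; ++i) {
        logits_base[i] = candidates->data[i].logit;
    }

    llama_sample_apply_guidance(ctx, logits_base.data(), llama_get_logits(guidance_ctx), scale);

    for (int i = 0; i < n_vocab; ++i) {
        candidates->data[i].logit = logits_base[i];
    }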
9981
10191
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
9982
10192
  GGML_ASSERT(ctx);
9983
10193
 
@@ -10508,31 +10718,47 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10508
10718
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
10509
10719
  new_type = GGML_TYPE_Q8_0;
10510
10720
  }
10511
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10721
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
10722
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10512
10723
  new_type = GGML_TYPE_Q5_K;
10513
10724
  }
10514
10725
  else if (new_type != GGML_TYPE_Q8_0) {
10515
10726
  new_type = GGML_TYPE_Q6_K;
10516
10727
  }
10517
10728
  } else if (name == "token_embd.weight") {
10518
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10729
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
10730
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10519
10731
  new_type = GGML_TYPE_Q2_K;
10520
10732
  }
10733
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10734
+ new_type = GGML_TYPE_IQ3_S;
10735
+ }
10521
10736
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10522
- new_type = GGML_TYPE_Q4_K;
10737
+ new_type = GGML_TYPE_IQ3_S;
10523
10738
  }
10524
- } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10739
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
10740
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
10525
10741
  if (name.find("attn_v.weight") != std::string::npos) {
10526
10742
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
10527
- else new_type = GGML_TYPE_Q2_K;
10743
+ else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
10528
10744
  ++qs.i_attention_wv;
10529
10745
  }
10746
+ else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
10747
+ new_type = GGML_TYPE_Q4_K;
10748
+ }
10530
10749
  else if (name.find("ffn_down") != std::string::npos) {
10531
- if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
10750
+ if (qs.i_ffn_down < qs.n_ffn_down/8) {
10751
+ new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
10752
+ }
10532
10753
  ++qs.i_ffn_down;
10533
10754
  }
10534
10755
  else if (name.find("attn_output.weight") != std::string::npos) {
10535
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
10756
+ if (qs.model.hparams.n_expert == 8) {
10757
+ new_type = GGML_TYPE_Q5_K;
10758
+ } else {
10759
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
10760
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
10761
+ }
10536
10762
  }
10537
10763
  } else if (name.find("attn_v.weight") != std::string::npos) {
10538
10764
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
@@ -10542,13 +10768,25 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10542
10768
  new_type = GGML_TYPE_Q4_K;
10543
10769
  }
10544
10770
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10545
- new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
10771
+ new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
10772
+ }
10773
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
10774
+ new_type = GGML_TYPE_Q4_K;
10775
+ }
10776
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
10777
+ new_type = GGML_TYPE_Q4_K;
10778
+ }
10779
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
10780
+ new_type = GGML_TYPE_Q4_K;
10781
+ }
10782
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
10783
+ new_type = GGML_TYPE_Q4_K;
10546
10784
  }
10547
10785
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
10548
10786
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
10549
10787
  }
10550
10788
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
10551
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
10789
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
10552
10790
  new_type = GGML_TYPE_Q5_K;
10553
10791
  }
10554
10792
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -10574,14 +10812,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10574
10812
  // TODO: explore better strategies
10575
10813
  new_type = GGML_TYPE_Q8_0;
10576
10814
  }
10577
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
10578
- new_type = GGML_TYPE_Q2_K;
10815
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
10816
+ new_type = GGML_TYPE_IQ3_XXS;
10817
+ }
10818
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10819
+ new_type = GGML_TYPE_IQ2_S;
10820
+ }
10821
+ } else if (name.find("attn_q.weight") != std::string::npos) {
10822
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
10823
+ new_type = GGML_TYPE_IQ3_XXS;
10824
+ }
10825
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10826
+ new_type = GGML_TYPE_IQ2_S;
10579
10827
  }
10580
10828
  } else if (name.find("ffn_down") != std::string::npos) {
10581
10829
  auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
10582
10830
  int i_layer = info.first, n_layer = info.second;
10583
10831
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
10584
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
10832
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
10585
10833
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
10586
10834
  }
10587
10835
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
@@ -10592,6 +10840,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10592
10840
  : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
10593
10841
  : GGML_TYPE_Q3_K;
10594
10842
  }
10843
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
10844
+ (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
10845
+ new_type = GGML_TYPE_Q4_K;
10846
+ }
10595
10847
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
10596
10848
  new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
10597
10849
  }
@@ -10603,8 +10855,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10603
10855
  if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
10604
10856
  }
10605
10857
  }
10606
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
10607
- if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
10858
+ else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
10859
+ new_type = GGML_TYPE_Q5_K;
10608
10860
  }
10609
10861
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
10610
10862
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
@@ -10621,39 +10873,43 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10621
10873
  } else if (name.find("attn_output.weight") != std::string::npos) {
10622
10874
  if (arch != LLM_ARCH_FALCON) {
10623
10875
  if (qs.model.hparams.n_expert == 8) {
10624
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
10876
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
10625
10877
  ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
10626
- ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
10878
+ ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
10879
+ ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
10627
10880
  new_type = GGML_TYPE_Q5_K;
10628
10881
  }
10629
10882
  } else {
10630
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
10631
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
10632
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
10633
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
10883
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
10884
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
10885
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
10886
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
10887
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
10634
10888
  }
10635
10889
  } else {
10636
10890
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
10637
10891
  }
10638
10892
  }
10639
10893
  else if (name.find("attn_qkv.weight") != std::string::npos) {
10640
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
10894
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
10895
+ new_type = GGML_TYPE_Q4_K;
10896
+ }
10641
10897
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
10642
10898
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
10643
10899
  }
10644
10900
  else if (name.find("ffn_gate") != std::string::npos) {
10645
10901
  auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
10646
10902
  int i_layer = info.first, n_layer = info.second;
10647
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
10648
- new_type = GGML_TYPE_Q2_K;
10903
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
10904
+ new_type = GGML_TYPE_IQ3_XXS;
10649
10905
  }
10650
10906
  ++qs.i_ffn_gate;
10651
10907
  }
10652
10908
  else if (name.find("ffn_up") != std::string::npos) {
10653
10909
  auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
10654
10910
  int i_layer = info.first, n_layer = info.second;
10655
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
10656
- new_type = GGML_TYPE_Q2_K;
10911
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
10912
+ new_type = GGML_TYPE_IQ3_XXS;
10657
10913
  }
10658
10914
  ++qs.i_ffn_up;
10659
10915
  }
@@ -10671,9 +10927,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10671
10927
  //}
10672
10928
  bool convert_incompatible_tensor = false;
10673
10929
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
10674
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
10675
- new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
10676
- new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10930
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
10931
+ new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
10932
+ new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
10677
10933
  int nx = tensor->ne[0];
10678
10934
  int ny = tensor->ne[1];
10679
10935
  if (nx % QK_K != 0) {
@@ -10687,13 +10943,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10687
10943
  switch (new_type) {
10688
10944
  case GGML_TYPE_IQ2_XXS:
10689
10945
  case GGML_TYPE_IQ2_XS:
10946
+ case GGML_TYPE_IQ2_S:
10690
10947
  case GGML_TYPE_IQ3_XXS:
10948
+ case GGML_TYPE_IQ3_S:
10691
10949
  case GGML_TYPE_IQ1_S:
10692
10950
  case GGML_TYPE_Q2_K:
10693
- case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
10694
- case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
10695
- case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
10696
- case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
10951
+ case GGML_TYPE_Q3_K:
10952
+ case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
10953
+ case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
10954
+ case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
10955
+ case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
10697
10956
  default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
10698
10957
  }
10699
10958
  LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
@@ -10719,7 +10978,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10719
10978
  // K-quants
10720
10979
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
10721
10980
  case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
10722
- case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
10981
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break;
10723
10982
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
10724
10983
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
10725
10984
  case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -10730,9 +10989,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10730
10989
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
10731
10990
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
10732
10991
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
10992
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break;
10993
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break;
10733
10994
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
10734
10995
  case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
10735
10996
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
10997
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break;
10998
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
10999
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
10736
11000
 
10737
11001
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
10738
11002
  }
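The switch above maps the new file types onto their storage types (note that IQ2_S files are mostly stored as GGML_TYPE_IQ2_XS, IQ2_M as GGML_TYPE_IQ2_S, and both IQ3_S and IQ3_M as GGML_TYPE_IQ3_S). A hedged usage sketch for one of the new types, assuming the usual llama_model_quantize_default_params()/llama_model_quantize() entry points from llama.h (paths are placeholders):

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_IQ3_M;

    if (llama_model_quantize("model-f16.gguf", "model-iq3_m.gguf", &qparams) != 0) {
        fprintf(stderr, "quantization failed\n");
    }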
@@ -10862,7 +11126,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10862
11126
  quantize &= !params->only_copy;
10863
11127
 
10864
11128
  // do not quantize expert gating tensors
10865
- quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
11129
+ // NOTE: can't use LLM_TN here because the layer number is not known
11130
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
10866
11131
 
10867
11132
  // do not quantize positional embeddings and token types (BERT)
10868
11133
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
@@ -10906,6 +11171,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10906
11171
  }
10907
11172
  if ((new_type == GGML_TYPE_IQ2_XXS ||
10908
11173
  new_type == GGML_TYPE_IQ2_XS ||
11174
+ new_type == GGML_TYPE_IQ2_S ||
10909
11175
  new_type == GGML_TYPE_IQ1_S ||
10910
11176
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
10911
11177
  LLAMA_LOG_ERROR("\n\n============================================================\n");
@@ -11327,7 +11593,7 @@ static int llama_apply_lora_from_file_internal(
11327
11593
  struct llama_model_params llama_model_default_params() {
11328
11594
  struct llama_model_params result = {
11329
11595
  /*.n_gpu_layers =*/ 0,
11330
- /*.split_mode =*/ LLAMA_SPLIT_LAYER,
11596
+ /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
11331
11597
  /*.main_gpu =*/ 0,
11332
11598
  /*.tensor_split =*/ nullptr,
11333
11599
  /*.progress_callback =*/ nullptr,
@@ -11353,7 +11619,7 @@ struct llama_context_params llama_context_default_params() {
11353
11619
  /*.n_batch =*/ 512,
11354
11620
  /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
11355
11621
  /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
11356
- /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
11622
+ /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
11357
11623
  /*.rope_freq_base =*/ 0.0f,
11358
11624
  /*.rope_freq_scale =*/ 0.0f,
11359
11625
  /*.yarn_ext_factor =*/ -1.0f,
@@ -11361,11 +11627,11 @@ struct llama_context_params llama_context_default_params() {
11361
11627
  /*.yarn_beta_fast =*/ 32.0f,
11362
11628
  /*.yarn_beta_slow =*/ 1.0f,
11363
11629
  /*.yarn_orig_ctx =*/ 0,
11630
+ /*.defrag_thold =*/ -1.0f,
11364
11631
  /*.cb_eval =*/ nullptr,
11365
11632
  /*.cb_eval_user_data =*/ nullptr,
11366
11633
  /*.type_k =*/ GGML_TYPE_F16,
11367
11634
  /*.type_v =*/ GGML_TYPE_F16,
11368
- /*.mul_mat_q =*/ true,
11369
11635
  /*.logits_all =*/ false,
11370
11636
  /*.embedding =*/ false,
11371
11637
  /*.offload_kqv =*/ true,
@@ -11421,15 +11687,6 @@ bool llama_supports_gpu_offload(void) {
11421
11687
  #endif
11422
11688
  }
11423
11689
 
11424
- // deprecated:
11425
- bool llama_mmap_supported(void) {
11426
- return llama_supports_mmap();
11427
- }
11428
-
11429
- bool llama_mlock_supported(void) {
11430
- return llama_supports_mlock();
11431
- }
11432
-
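The deprecated llama_mmap_supported()/llama_mlock_supported() wrappers are removed; callers use the llama_supports_*() functions they forwarded to:

    const bool use_mmap  = llama_supports_mmap();
    const bool use_mlock = llama_supports_mlock();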
11433
11690
  void llama_backend_init(void) {
11434
11691
  ggml_time_init();
11435
11692
 
@@ -11525,7 +11782,7 @@ struct llama_context * llama_new_context_with_model(
11525
11782
  cparams.yarn_attn_factor = params.yarn_attn_factor;
11526
11783
  cparams.yarn_beta_fast = params.yarn_beta_fast;
11527
11784
  cparams.yarn_beta_slow = params.yarn_beta_slow;
11528
- cparams.mul_mat_q = params.mul_mat_q;
11785
+ cparams.defrag_thold = params.defrag_thold;
11529
11786
  cparams.offload_kqv = params.offload_kqv;
11530
11787
  cparams.do_pooling = params.do_pooling;
11531
11788
 
@@ -11541,16 +11798,16 @@ struct llama_context * llama_new_context_with_model(
11541
11798
  cparams.cb_eval_user_data = params.cb_eval_user_data;
11542
11799
 
11543
11800
  auto rope_scaling_type = params.rope_scaling_type;
11544
- if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
11801
+ if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
11545
11802
  rope_scaling_type = hparams.rope_scaling_type_train;
11546
11803
  }
11547
11804
 
11548
- if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
11805
+ if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
11549
11806
  cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
11550
11807
  }
11551
11808
 
11552
11809
  if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
11553
- cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
11810
+ cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
11554
11811
  }
11555
11812
 
11556
11813
  if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -11584,8 +11841,8 @@ struct llama_context * llama_new_context_with_model(
11584
11841
  }
11585
11842
  #elif defined(GGML_USE_CUBLAS)
11586
11843
  if (model->n_gpu_layers > 0) {
11587
- // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
11588
- if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
11844
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
11845
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
11589
11846
  ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
11590
11847
  if (backend == nullptr) {
11591
11848
  LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -11594,7 +11851,7 @@ struct llama_context * llama_new_context_with_model(
11594
11851
  }
11595
11852
  ctx->backends.push_back(backend);
11596
11853
  } else {
11597
- // LLAMA_SPLIT_LAYER requires a backend for each GPU
11854
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
11598
11855
  for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
11599
11856
  ggml_backend_t backend = ggml_backend_cuda_init(device);
11600
11857
  if (backend == nullptr) {
@@ -11647,8 +11904,7 @@ struct llama_context * llama_new_context_with_model(
11647
11904
  }
11648
11905
  ctx->backends.push_back(ctx->backend_cpu);
11649
11906
 
11650
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
11651
- cparams.n_ctx, cparams.offload_kqv)) {
11907
+ if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
11652
11908
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
11653
11909
  llama_free(ctx);
11654
11910
  return nullptr;
@@ -11727,7 +11983,7 @@ struct llama_context * llama_new_context_with_model(
11727
11983
  }
11728
11984
 
11729
11985
  // buffer used to store the computation graph and the tensor meta data
11730
- ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
11986
+ ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
11731
11987
 
11732
11988
  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
11733
11989
 
@@ -11796,6 +12052,49 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
11796
12052
  return model->vocab.type;
11797
12053
  }
11798
12054
 
12055
+ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
12056
+ switch (model->arch) {
12057
+ // these models do not use RoPE
12058
+ case LLM_ARCH_GPT2:
12059
+ case LLM_ARCH_GPTJ:
12060
+ case LLM_ARCH_GPTNEOX:
12061
+ case LLM_ARCH_MPT:
12062
+ case LLM_ARCH_REFACT:
12063
+ case LLM_ARCH_BLOOM:
12064
+ return LLAMA_ROPE_TYPE_NONE;
12065
+
12066
+ // use what we call a normal RoPE, operating on pairs of consecutive head values
12067
+ case LLM_ARCH_LLAMA:
12068
+ case LLM_ARCH_BAICHUAN:
12069
+ case LLM_ARCH_STARCODER:
12070
+ case LLM_ARCH_PLAMO:
12071
+ case LLM_ARCH_CODESHELL:
12072
+ case LLM_ARCH_ORION:
12073
+ case LLM_ARCH_INTERNLM2:
12074
+ case LLM_ARCH_MINICPM:
12075
+ return LLAMA_ROPE_TYPE_NORM;
12076
+
12077
+ // the pairs of head values are offset by n_rot/2
12078
+ case LLM_ARCH_FALCON:
12079
+ case LLM_ARCH_PERSIMMON:
12080
+ case LLM_ARCH_BERT:
12081
+ case LLM_ARCH_NOMIC_BERT:
12082
+ case LLM_ARCH_STABLELM:
12083
+ case LLM_ARCH_QWEN:
12084
+ case LLM_ARCH_QWEN2:
12085
+ case LLM_ARCH_PHI2:
12086
+ case LLM_ARCH_GEMMA:
12087
+ return LLAMA_ROPE_TYPE_NEOX;
12088
+
12089
+ // all model arches should be listed explicitly here
12090
+ case LLM_ARCH_UNKNOWN:
12091
+ GGML_ASSERT(false && "unknown architecture");
12092
+ break;
12093
+ }
12094
+
12095
+ return LLAMA_ROPE_TYPE_NONE;
12096
+ }
12097
+
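The new llama_rope_type() accessor exposes which RoPE variant a loaded model uses (none, normal, or NeoX-style); the K-shift path in llama_kv_cache_update_internal() relies on the same classification to skip shifts for non-RoPE models. A minimal sketch:

    if (llama_rope_type(model) == LLAMA_ROPE_TYPE_NONE) {
        // this architecture has no rotary position embeddings, so K-shifts are a no-op
    }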
11799
12098
  int32_t llama_n_vocab(const struct llama_model * model) {
11800
12099
  return model->vocab.id_to_token.size();
11801
12100
  }
@@ -11898,15 +12197,6 @@ uint32_t llama_model_quantize(
11898
12197
  }
11899
12198
  }
11900
12199
 
11901
- int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
11902
- try {
11903
- return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
11904
- } catch (const std::exception & err) {
11905
- LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
11906
- return 1;
11907
- }
11908
- }
11909
-
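The context-level llama_apply_lora_from_file() is removed; the model-level variant below replaces it. A hedged migration sketch (path_lora and n_threads are the caller's values; llama_get_model() is assumed from llama.h):

    const int32_t rc = llama_model_apply_lora_from_file(
        llama_get_model(ctx), path_lora, /*scale =*/ 1.0f, /*path_base_model =*/ NULL, n_threads);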
11910
12200
  int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
11911
12201
  try {
11912
12202
  return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
@@ -12038,12 +12328,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
12038
12328
  llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
12039
12329
  }
12040
12330
 
12041
- void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
12331
+ void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
12042
12332
  if (delta == 0) {
12043
12333
  return;
12044
12334
  }
12045
12335
 
12046
- llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
12336
+ llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
12047
12337
  }
12048
12338
 
12049
12339
  void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
@@ -12054,6 +12344,19 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
12054
12344
  llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
12055
12345
  }
12056
12346
 
12347
+ llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
12348
+ return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
12349
+ }
12350
+
12351
+ void llama_kv_cache_defrag(struct llama_context * ctx) {
12352
+ llama_kv_cache_defrag(ctx->kv_self);
12353
+ }
12354
+
12355
+ void llama_kv_cache_update(struct llama_context * ctx) {
12356
+ llama_kv_cache_update_internal(*ctx);
12357
+ }
12358
+
12359
+
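Together with the llama_kv_cache_seq_add() rename above, these wrappers make cache maintenance an explicit part of the API: a defragmentation can be requested manually and applied, together with any pending K-shift, outside of llama_decode(). A short sketch:

    llama_kv_cache_defrag(ctx);      // queue a defrag of the self-attention cache
    llama_kv_cache_update(ctx);      // apply any pending K-shift and/or defrag now

    const llama_pos pos_max = llama_kv_cache_seq_pos_max(ctx, 0);   // highest position stored for sequence 0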
12057
12360
  // Returns the *maximum* size of the state
12058
12361
  size_t llama_get_state_size(const struct llama_context * ctx) {
12059
12362
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
@@ -12180,10 +12483,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12180
12483
  const auto & hparams = ctx->model.hparams;
12181
12484
  const auto & cparams = ctx->cparams;
12182
12485
 
12183
- const auto n_layer = hparams.n_layer;
12184
- const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
12185
- const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
12186
- const auto n_ctx = cparams.n_ctx;
12486
+ const uint32_t n_layer = hparams.n_layer;
12487
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12488
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12489
+ const uint32_t n_ctx = cparams.n_ctx;
12187
12490
 
12188
12491
  const size_t kv_buf_size = kv_self.total_size();
12189
12492
  const uint32_t kv_head = kv_self.head;
@@ -12198,14 +12501,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12198
12501
  if (kv_buf_size) {
12199
12502
  std::vector<uint8_t> tmp_buf;
12200
12503
  for (int il = 0; il < (int) n_layer; ++il) {
12201
- size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12504
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12505
+
12202
12506
  tmp_buf.resize(k_size);
12203
12507
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
12204
12508
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
12205
12509
 
12206
12510
  // v is not contiguous, copy row by row
12207
- size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12208
- size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12511
+ const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12512
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12513
+
12209
12514
  tmp_buf.resize(v_row_size);
12210
12515
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12211
12516
  ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
@@ -12238,8 +12543,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
12238
12543
  }
12239
12544
 
12240
12545
  // Sets the state reading from the specified source address
12241
- size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12242
- uint8_t * inp = src;
12546
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12547
+ const uint8_t * inp = src;
12243
12548
 
12244
12549
  // set rng
12245
12550
  {
@@ -12248,7 +12553,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12248
12553
 
12249
12554
  GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
12250
12555
 
12251
- std::string rng_str((char *)inp, rng_size); inp += rng_size;
12556
+ std::string rng_str((const char *)inp, rng_size); inp += rng_size;
12252
12557
 
12253
12558
  std::istringstream rng_ss(rng_str);
12254
12559
  rng_ss >> ctx->rng;
@@ -12292,10 +12597,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12292
12597
  const auto & hparams = ctx->model.hparams;
12293
12598
  const auto & cparams = ctx->cparams;
12294
12599
 
12295
- const int n_layer = hparams.n_layer;
12296
- const int n_embd_k_gqa = hparams.n_embd_k_gqa();
12297
- const int n_embd_v_gqa = hparams.n_embd_v_gqa();
12298
- const int n_ctx = cparams.n_ctx;
12600
+ const uint32_t n_layer = hparams.n_layer;
12601
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12602
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12603
+ const uint32_t n_ctx = cparams.n_ctx;
12299
12604
 
12300
12605
  size_t kv_buf_size;
12301
12606
  uint32_t kv_head;
@@ -12311,13 +12616,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12311
12616
  GGML_ASSERT(kv_self.total_size() == kv_buf_size);
12312
12617
 
12313
12618
  for (int il = 0; il < (int) n_layer; ++il) {
12314
- size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12619
+ const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12620
+
12315
12621
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
12316
12622
  inp += k_size;
12317
12623
 
12318
12624
  // v is not contiguous, copy row by row
12319
- size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12320
- size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12625
+ const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12626
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12627
+
12321
12628
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12322
12629
  ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
12323
12630
  inp += v_row_size;
@@ -12439,38 +12746,6 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
12439
12746
  return true;
12440
12747
  }
12441
12748
 
12442
- int llama_eval(
12443
- struct llama_context * ctx,
12444
- llama_token * tokens,
12445
- int32_t n_tokens,
12446
- int32_t n_past) {
12447
- llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
12448
-
12449
- const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
12450
- if (ret < 0) {
12451
- LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
12452
- }
12453
-
12454
- return ret;
12455
- }
12456
-
12457
- int llama_eval_embd(
12458
- struct llama_context * ctx,
12459
- float * embd,
12460
- int32_t n_tokens,
12461
- int32_t n_past) {
12462
- llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
12463
-
12464
- llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
12465
-
12466
- const int ret = llama_decode_internal(*ctx, batch);
12467
- if (ret < 0) {
12468
- LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
12469
- }
12470
-
12471
- return ret;
12472
- }
12473
-
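The deprecated llama_eval() and llama_eval_embd() entry points are removed; the batch API replaces them. A hedged migration sketch for former llama_eval() callers that mirrors what the removed wrapper did internally (tokens, n_tokens and n_past are the caller's values; llama_decode() and llama_kv_cache_seq_rm() are the public counterparts of the calls used above):

    llama_kv_cache_seq_rm(ctx, -1, n_past, -1);

    if (llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)) < 0) {
        fprintf(stderr, "llama_decode failed\n");
    }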
12474
12749
  void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
12475
12750
  ctx->cparams.n_threads = n_threads;
12476
12751
  ctx->cparams.n_threads_batch = n_threads_batch;