llama_cpp 0.9.2 → 0.9.4

@@ -91,6 +91,8 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif
 
+ #define LLAMA_MAX_NODES 8192
+
  //
  // logging
  //
@@ -190,6 +192,7 @@ enum llm_arch {
  LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BLOOM,
+ LLM_ARCH_STABLELM,
  LLM_ARCH_UNKNOWN,
  };
 
@@ -205,6 +208,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_PERSIMMON, "persimmon" },
  { LLM_ARCH_REFACT, "refact" },
  { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
  };
 
  enum llm_kv {
@@ -251,6 +255,8 @@ enum llm_kv {
  LLM_KV_TOKENIZER_UNK_ID,
  LLM_KV_TOKENIZER_SEP_ID,
  LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_ADD_BOS,
+ LLM_KV_TOKENIZER_ADD_EOS,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
  };
@@ -299,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
  { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+ { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
  };
@@ -493,6 +501,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  },
  },
+ {
+ LLM_ARCH_STABLELM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -577,6 +604,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
  return LLAMA_ROPE_SCALING_UNSPECIFIED;
  }
 
+ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+ switch (type) {
+ case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
+ case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
+ case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
+ case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
+ case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
+ case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
+ case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
+ case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
+ case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+ case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+ case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
+ default: return format("unknown type %d", type);
+ }
+ }
+
+ static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+ switch (type) {
+ case GGUF_TYPE_STRING:
+ return gguf_get_val_str(ctx_gguf, i);
+ case GGUF_TYPE_ARRAY:
+ {
+ const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+ int arr_n = gguf_get_arr_n(ctx_gguf, i);
+ const void * data = gguf_get_arr_data(ctx_gguf, i);
+ std::stringstream ss;
+ ss << "[";
+ for (int j = 0; j < arr_n; j++) {
+ if (arr_type == GGUF_TYPE_STRING) {
+ std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+ // escape quotes
+ replace_all(val, "\\", "\\\\");
+ replace_all(val, "\"", "\\\"");
+ ss << '"' << val << '"';
+ } else if (arr_type == GGUF_TYPE_ARRAY) {
+ ss << "???";
+ } else {
+ ss << gguf_data_to_str(arr_type, data, j);
+ }
+ if (j < arr_n - 1) {
+ ss << ", ";
+ }
+ }
+ ss << "]";
+ return ss.str();
+ }
+ default:
+ return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+ }
+ }
+
  //
  // ggml helpers
  //
@@ -1055,9 +1136,9 @@ enum e_model {
  MODEL_70B,
  };
 
- static const size_t kB = 1024;
- static const size_t MB = 1024*kB;
- static const size_t GB = 1024*MB;
+ static const size_t kiB = 1024;
+ static const size_t MiB = 1024*kiB;
+ static const size_t GiB = 1024*MiB;
 
  struct llama_hparams {
  bool vocab_only;
@@ -1194,6 +1275,7 @@ struct llama_kv_cache {
  // cannot be freely changed after a slot has been allocated.
  uint32_t head = 0;
  uint32_t size = 0;
+ uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
  // computed before each graph build
  uint32_t n = 0;
@@ -1248,6 +1330,9 @@ struct llama_vocab {
  id special_sep_id = -1;
  id special_pad_id = -1;
 
+ int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+ int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
  id linefeed_id = 13;
  id special_prefix_id = 32007;
  id special_middle_id = 32009;
@@ -1292,6 +1377,9 @@ struct llama_model {
 
  int n_gpu_layers;
 
+ // gguf metadata
+ std::unordered_map<std::string, std::string> gguf_kv;
+
  // context
  struct ggml_context * ctx = NULL;
 
@@ -1412,6 +1500,7 @@ static bool llama_kv_cache_init(
 
  cache.head = 0;
  cache.size = n_ctx;
+ cache.used = 0;
 
  cache.cells.clear();
  cache.cells.resize(n_ctx);
@@ -1453,7 +1542,7 @@ static bool llama_kv_cache_init(
  vram_kv_cache += ggml_nbytes(cache.k);
  }
  if (vram_kv_cache > 0) {
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
  }
  }
  #endif
@@ -1513,6 +1602,8 @@ static bool llama_kv_cache_find_slot(
  }
  }
 
+ cache.used += n_tokens;
+
  return true;
  }
 
@@ -1533,6 +1624,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
  cache.cells[i].seq_id.clear();
  }
  cache.head = 0;
+ cache.used = 0;
  }
 
  static void llama_kv_cache_seq_rm(
@@ -1555,6 +1647,9 @@ static void llama_kv_cache_seq_rm(
  continue;
  }
  if (cache.cells[i].seq_id.empty()) {
+ // keep count of the number of used cells
+ if (cache.cells[i].pos >= 0) cache.used--;
+
  cache.cells[i].pos = -1;
  if (new_head == cache.size) new_head = i;
  }
@@ -1562,7 +1657,7 @@ static void llama_kv_cache_seq_rm(
  }
 
  // If we freed up a slot, set head to it so searching can start there.
- if (new_head != cache.size) cache.head = new_head;
+ if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }
 
  static void llama_kv_cache_seq_cp(
@@ -1588,6 +1683,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
 
  for (uint32_t i = 0; i < cache.size; ++i) {
  if (!cache.cells[i].has_seq_id(seq_id)) {
+ if (cache.cells[i].pos >= 0) cache.used--;
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
  if (new_head == cache.size) new_head = i;
@@ -1598,7 +1694,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
  }
 
  // If we freed up a slot, set head to it so searching can start there.
- if (new_head != cache.size) cache.head = new_head;
+ if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }
 
  static void llama_kv_cache_seq_shift(
@@ -1619,6 +1715,7 @@ static void llama_kv_cache_seq_shift(
  cache.cells[i].delta += delta;
 
  if (cache.cells[i].pos < 0) {
+ if (!cache.cells[i].seq_id.empty()) cache.used--;
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
  if (new_head == cache.size) new_head = i;
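Note: the hunks above thread a new "used" counter through every KV-cache mutation (init, find_slot, clear, seq_rm, seq_keep, seq_shift). A minimal sketch of the invariant they maintain, assuming the llama_kv_cache/llama_kv_cell definitions from this file (the helper name below is illustrative, not part of the diff):

    // Recompute what cache.used should be: the number of cells that still hold
    // at least one sequence id (such cells also have pos >= 0).
    static uint32_t llama_kv_cache_count_used(const llama_kv_cache & cache) {
        uint32_t n_used = 0;
        for (const auto & cell : cache.cells) {
            if (!cell.seq_id.empty()) {
                n_used++;
            }
        }
        return n_used; // expected to equal cache.used after each of the mutations above
    }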
@@ -1750,10 +1847,10 @@ struct llama_model_loader {
  case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
  default:
- {
- LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
- ftype = LLAMA_FTYPE_ALL_F32;
- } break;
+ {
+ LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+ ftype = LLAMA_FTYPE_ALL_F32;
+ } break;
  }
 
  // this is a way to mark that we have "guessed" the file type
@@ -1767,10 +1864,21 @@ struct llama_model_loader {
  }
 
  for (int i = 0; i < n_kv; i++) {
- const char * name = gguf_get_key(ctx_gguf, i);
- const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const char * name = gguf_get_key(ctx_gguf, i);
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const std::string type_name =
+ type == GGUF_TYPE_ARRAY
+ ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+ : gguf_type_name(type);
+
+ std::string value = gguf_kv_to_str(ctx_gguf, i);
+ const size_t MAX_VALUE_LEN = 40;
+ if (value.size() > MAX_VALUE_LEN) {
+ value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+ }
+ replace_all(value, "\n", "\\n");
 
- LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
+ LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
  }
 
  // print type counts
@@ -2065,6 +2173,17 @@ static void llm_load_hparams(
 
  auto & hparams = model.hparams;
 
+ // get metadata as string
+ for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+ enum gguf_type type = gguf_get_kv_type(ctx, i);
+ if (type == GGUF_TYPE_ARRAY) {
+ continue;
+ }
+ const char * name = gguf_get_key(ctx, i);
+ const std::string value = gguf_kv_to_str(ctx, i);
+ model.gguf_kv.emplace(name, value);
+ }
+
  // get general kv
  GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
 
@@ -2209,6 +2328,16 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+
  default: (void)0;
  }
 
@@ -2350,6 +2479,23 @@ static void llm_load_vocab(
  __func__, key.c_str(), id, old_id);
  id = old_id;
  }
+
+ }
+
+ // Handle add_bos_token and add_eos_token
+ std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+ int kid = gguf_find_key(ctx, key.c_str());
+ enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+ vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+ }
+ key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+ kid = gguf_find_key(ctx, key.c_str());
+ ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+ vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
  }
  }
 
@@ -2481,8 +2627,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
  LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
  LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
- if (ml.n_bytes < GB) {
- LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+ if (ml.n_bytes < GiB) {
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  } else {
  LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  }
@@ -2520,7 +2666,7 @@ static void llm_load_tensors(
 
  ml.calc_sizes(ctx_size, mmapped_size);
 
- LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
 
  // create the ggml context
  {
@@ -2872,6 +3018,13 @@ static void llm_load_tensors(
  ggml_backend_type backend_output;
 
  if (n_gpu_layers > int(n_layer)) {
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > int(n_layer + 1)) {
+ LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
+ __func__, n_layer + 1);
+ throw std::runtime_error("Persimmon CUDA offload failed");
+ }
+ #endif
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
@@ -3073,6 +3226,81 @@ static void llm_load_tensors(
  }
  }
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = llama_backend_offload;
+ #else
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+ #endif // _WIN32
+
+ backend_output = llama_backend_offload_split;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ /*
+ llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
+ */
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+ ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+ }
+ }
+ } break;
+
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -3087,7 +3315,7 @@ static void llm_load_tensors(
  ctx_size +
  mmapped_size - vram_weights; // weights in VRAM not in memory
 
- LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
 
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3106,7 +3334,7 @@ static void llm_load_tensors(
  #endif // GGML_USE_CUBLAS
 
  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
  #else
  (void) n_gpu_layers;
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3606,7 +3834,7 @@ struct llm_build_context {
  }
 
  struct ggml_cgraph * build_llama() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
  GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -3718,7 +3946,7 @@ struct llm_build_context {
  }
 
  struct ggml_cgraph * build_baichuan() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -3838,7 +4066,7 @@ struct llm_build_context {
  }
 
  struct ggml_cgraph * build_falcon() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -3960,7 +4188,7 @@ struct llm_build_context {
  }
 
  struct ggml_cgraph * build_starcoder() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
  struct ggml_tensor * cur;
  struct ggml_tensor * pos;
@@ -4059,7 +4287,7 @@ struct llm_build_context {
  }
 
  struct ggml_cgraph * build_persimmon() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
  const int64_t n_rot = n_embd_head / 2;
 
@@ -4204,7 +4432,7 @@ struct llm_build_context {
  struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
  cb(Kcur, "Kcur", il);
 
- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
  cb(Q, "Q", il);
 
  Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
@@ -4269,7 +4497,7 @@ struct llm_build_context {
  }
 
  struct ggml_cgraph * build_refact() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4360,7 +4588,7 @@ struct llm_build_context {
  }
 
  struct ggml_cgraph * build_bloom() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4454,7 +4682,7 @@ struct llm_build_context {
  }
 
  struct ggml_cgraph * build_mpt() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
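Note: the build_*() hunks above all swap ggml_new_graph(ctx0) for ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false), so graph capacity is governed by the new LLAMA_MAX_NODES constant (8192) instead of ggml's compile-time default; the compute-buffer sizing change further down uses the same constant. A minimal sketch of the pattern, assuming a valid ggml_context * ctx0:

    // Reserve space for up to LLAMA_MAX_NODES graph nodes; the last argument
    // disables gradient bookkeeping, which inference graphs do not need.
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, /*grads =*/ false);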
@@ -4551,6 +4779,119 @@ struct llm_build_context {
 
  return gf;
  }
+
+ struct ggml_cgraph * build_stablelm() {
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_scale
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ cb(KQ_scale, "KQ_scale", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(ctx0, hparams, kv_self,
+ model.layers[il].wo, NULL,
+ Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };
 
  //
@@ -5020,6 +5361,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_mpt();
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ result = llm.build_stablelm();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -5129,6 +5474,12 @@ static int llama_decode_internal(
  batch.seq_id = seq_id_arr.data();
  }
 
+ // if we have enough unused cells before the current head ->
+ // better to start searching from the beginning of the cache, hoping to fill it
+ if (kv_self.head > kv_self.used + 2*n_tokens) {
+ kv_self.head = 0;
+ }
+
  if (!llama_kv_cache_find_slot(kv_self, batch)) {
  return 1;
  }
@@ -5139,7 +5490,7 @@ static int llama_decode_internal(
  //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
  kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
 
- //printf("kv_self.n = %d\n", kv_self.n);
+ //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
  ggml_allocr_reset(lctx.alloc);
 
@@ -5195,7 +5546,8 @@ static int llama_decode_internal(
  model.arch == LLM_ARCH_FALCON ||
  model.arch == LLM_ARCH_REFACT ||
  model.arch == LLM_ARCH_MPT ||
- model.arch == LLM_ARCH_STARCODER;
+ model.arch == LLM_ARCH_STARCODER ||
+ model.arch == LLM_ARCH_STABLELM;
 
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -5987,7 +6339,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
  // and passing 'add space prefix' as bool argument
  //
- auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ if (&fragment == &fragment_buffer.front()) {
+ raw_text = " " + raw_text; // prefix with space if the first token is not special
+ }
 
  #ifdef PRETOKENIZERDEBUG
  fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
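Note: the tokenizer hunk above narrows where the SentencePiece space prefix is applied: previously every non-special fragment received a leading space, now only the first fragment of the input does, so text that follows a special token no longer picks up a spurious space. A hedged sketch of driving this path through the public C API, assuming a loaded model and the llama_tokenize() signature shipped with this version's llama.h (the wrapper below is illustrative, not part of the diff):

    #include <string>
    #include <vector>

    static std::vector<llama_token> tokenize_with_specials(const llama_model * model, const std::string & text) {
        std::vector<llama_token> tokens(text.size() + 8); // over-allocate, then shrink to the reported count
        const int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                                     tokens.data(), (int) tokens.size(),
                                     /*add_bos =*/ true, /*special =*/ true);
        tokens.resize(n > 0 ? n : 0);
        return tokens;
    }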
@@ -7639,7 +7994,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  workers.clear();
  }
 
- LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
  int64_t tot_count = 0;
  for (size_t i = 0; i < hist_cur.size(); i++) {
  hist_all[i] += hist_cur[i];
@@ -8179,7 +8534,7 @@ struct llama_context * llama_new_context_with_model(
 
  {
  const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
- LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
  }
 
  // resized during inference
@@ -8196,7 +8551,7 @@ struct llama_context * llama_new_context_with_model(
  {
  static const size_t tensor_alignment = 32;
  // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
- ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+ ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
 
  // create measure allocator
  ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
@@ -8224,7 +8579,7 @@ struct llama_context * llama_new_context_with_model(
  // measure memory requirements for the graph
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
- LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
 
  // recreate allocator with exact memory requirements
  ggml_allocr_free(ctx->alloc);
@@ -8238,7 +8593,7 @@ struct llama_context * llama_new_context_with_model(
  #endif
  #ifdef GGML_USE_CUBLAS
  ggml_cuda_set_scratch_size(alloc_size);
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
 
  // calculate total VRAM usage
  auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8258,10 +8613,10 @@ struct llama_context * llama_new_context_with_model(
  size_t ctx_vram_size = alloc_size + kv_vram_size;
  size_t total_vram_size = model_vram_size + ctx_vram_size;
 
- LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
  total_vram_size / 1024.0 / 1024.0,
  model_vram_size / 1024.0 / 1024.0,
- ctx_vram_size / 1024.0 / 1024.0);
+ ctx_vram_size / 1024.0 / 1024.0);
  #endif
  }
 
@@ -8282,7 +8637,7 @@ struct llama_context * llama_new_context_with_model(
 
  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
- LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
 
  #define LLAMA_METAL_CHECK_BUF(result) \
  if (!(result)) { \
@@ -8348,6 +8703,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
  return model->hparams.rope_freq_scale_train;
  }
 
+ int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+ const auto & it = model->gguf_kv.find(key);
+ if (it == model->gguf_kv.end()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
+ int llama_model_meta_count(const struct llama_model * model) {
+ return (int)model->gguf_kv.size();
+ }
+
+ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = model->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->first.c_str());
+ }
+
+ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = model->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  return snprintf(buf, buf_size, "%s %s %s",
  llama_model_arch_name(model->arch).c_str(),
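Note: the new llama_model_meta_* functions above expose the model.gguf_kv map that llm_load_hparams now fills. A hedged usage sketch, assuming a llama_model * obtained elsewhere (for example via llama_load_model_from_file):

    #include <cstdio>

    static void dump_model_meta(const struct llama_model * model) {
        char key[256];
        char val[256];
        const int n_meta = llama_model_meta_count(model);
        for (int i = 0; i < n_meta; i++) {
            // both accessors return -1 on a bad index and leave an empty string in the buffer
            if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) < 0) continue;
            if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) continue;
            printf("%s = %s\n", key, val); // values are the strings produced by gguf_kv_to_str
        }
    }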
@@ -8406,8 +8800,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
  }
  }
 
+ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+ struct llama_kv_cache_view result = {
+ /*.n_cells = */ 0,
+ /*.n_max_seq = */ n_max_seq,
+ /*.token_count = */ 0,
+ /*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
+ /*.max_contiguous = */ 0,
+ /*.max_contiguous_idx = */ -1,
+ /*.cells = */ nullptr,
+ /*.cells_sequences = */ nullptr,
+ };
+ return result;
+ }
+
+ void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+ if (view->cells != nullptr) {
+ free(view->cells);
+ view->cells = nullptr;
+ }
+ if (view->cells_sequences != nullptr) {
+ free(view->cells_sequences);
+ view->cells_sequences = nullptr;
+ }
+ }
+
+ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+ if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+ view->n_cells = int32_t(ctx->kv_self.size);
+ void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+ GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+ view->cells = (struct llama_kv_cache_view_cell *)p;
+ p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+ GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+ view->cells_sequences = (llama_seq_id *)p;
+ }
+
+ const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+ llama_kv_cache_view_cell * c_curr = view->cells;
+ llama_seq_id * cs_curr = view->cells_sequences;
+ int32_t used_cells = 0;
+ int32_t token_count = 0;
+ int32_t curr_contig_idx = -1;
+ uint32_t max_contig = 0;
+ int32_t max_contig_idx = -1;
+
+ for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+ const size_t curr_size = kv_cells[i].seq_id.size();
+ token_count += curr_size;
+ c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+ if (curr_size > 0) {
+ if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+ max_contig = i - curr_contig_idx;
+ max_contig_idx = curr_contig_idx;
+ }
+ curr_contig_idx = -1;
+ } else if (curr_contig_idx < 0) {
+ curr_contig_idx = i;
+ }
+
+ int seq_idx = 0;
+ for (const llama_seq_id it : kv_cells[i].seq_id) {
+ if (seq_idx >= view->n_max_seq) {
+ break;
+ }
+ cs_curr[seq_idx] = it;
+ seq_idx++;
+ }
+ if (seq_idx != 0) {
+ used_cells++;
+ }
+ for (; seq_idx < view->n_max_seq; seq_idx++) {
+ cs_curr[seq_idx] = -1;
+ }
+ }
+ if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+ max_contig_idx = curr_contig_idx;
+ max_contig = kv_cells.size() - curr_contig_idx;
+ }
+ view->max_contiguous = max_contig;
+ view->max_contiguous_idx = max_contig_idx;
+ view->token_count = token_count;
+ view->used_cells = used_cells;
+ if (uint32_t(used_cells) != ctx->kv_self.used) {
+ LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+ __func__, ctx->kv_self.used, used_cells);
+ }
+ }
+
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->kv_self.head;
+ int result = 0;
+
+ for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
+ result += ctx->kv_self.cells[i].seq_id.size();
+ }
+
+ return result;
+ }
+
+ int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+ return ctx->kv_self.used;
  }
 
  void llama_kv_cache_clear(struct llama_context * ctx) {
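Note: the block above adds a read-only debugging view of the KV cache plus llama_get_kv_cache_used_cells, and redefines llama_get_kv_cache_token_count to count stored tokens instead of returning the head position. A hedged usage sketch, assuming a valid llama_context * and the llama_kv_cache_view fields initialized above:

    #include <cstdio>

    static void print_kv_cache_stats(const struct llama_context * ctx) {
        struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_max_seq =*/ 4);
        llama_kv_cache_view_update(ctx, &view); // allocates the cell arrays on first use, then refreshes them
        printf("cells: %d, used cells: %d, tokens: %d, longest empty run: %d (starting at cell %d)\n",
               view.n_cells, view.used_cells, view.token_count,
               view.max_contiguous, view.max_contiguous_idx);
        printf("context reports %d used cells and %d cached tokens\n",
               llama_get_kv_cache_used_cells(ctx), llama_get_kv_cache_token_count(ctx));
        llama_kv_cache_view_free(&view); // releases the malloc'd cells / cells_sequences arrays
    }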
@@ -8577,16 +9070,18 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  const size_t kv_buf_size = kv_self.buf.size;
  const uint32_t kv_head = kv_self.head;
  const uint32_t kv_size = kv_self.size;
+ const uint32_t kv_used = kv_self.used;
 
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
  data_ctx->write(&kv_head, sizeof(kv_head));
  data_ctx->write(&kv_size, sizeof(kv_size));
+ data_ctx->write(&kv_used, sizeof(kv_used));
 
  if (kv_buf_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);
 
- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
- ggml_cgraph gf{};
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
  std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -8604,9 +9099,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  kv_head, n_embd, n_layer,
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
  ggml_free(cpy_ctx);
 
@@ -8703,18 +9198,20 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  size_t kv_buf_size;
  uint32_t kv_head;
  uint32_t kv_size;
+ uint32_t kv_used;
 
  memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
  memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+ memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
 
  if (kv_buf_size) {
  GGML_ASSERT(kv_self.buf.size == kv_buf_size);
 
  const size_t elt_size = ggml_element_size(kv_self.k);
 
- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
- ggml_cgraph gf{};
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
  kin3d->data = (void *) inp;
@@ -8732,15 +9229,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  kv_head, n_embd, n_layer,
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
  ggml_free(cpy_ctx);
  }
 
  ctx->kv_self.head = kv_head;
  ctx->kv_self.size = kv_size;
+ ctx->kv_self.used = kv_used;
 
  ctx->kv_self.cells.resize(kv_size);
 
@@ -8989,6 +9487,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
  return model->vocab.linefeed_id;
  }
 
+ int llama_add_bos_token(const struct llama_model * model) {
+ return model->vocab.special_add_bos;
+ }
+
+ int llama_add_eos_token(const struct llama_model * model) {
+ return model->vocab.special_add_eos;
+ }
+
  llama_token llama_token_prefix(const struct llama_model * model) {
  return model->vocab.special_prefix_id;
  }
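Note: llama_add_bos_token and llama_add_eos_token surface the tokenizer.ggml.add_bos_token / tokenizer.ggml.add_eos_token flags read in llm_load_vocab: 1 means add, 0 means don't add, -1 means the GGUF file did not say. A hedged sketch of interpreting the tri-state result (the fallback used when the flag is missing is an assumption of this example, not something the diff prescribes):

    static bool should_add_bos(const struct llama_model * model) {
        const int add_bos = llama_add_bos_token(model); // 1 = add, 0 = don't, -1 = unspecified in the GGUF
        if (add_bos != -1) {
            return add_bos != 0;
        }
        return true; // assumed default when the metadata is absent; callers may prefer a vocab-type check
    }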