llama_cpp 0.9.2 → 0.9.4

This diff shows the content changes between publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
@@ -91,6 +91,8 @@
  #define LLAMA_ATTRIBUTE_FORMAT(...)
  #endif

+ #define LLAMA_MAX_NODES 8192
+
  //
  // logging
  //
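
The new LLAMA_MAX_NODES constant caps how many nodes a compute graph may hold. Later hunks switch every build_*() function from ggml_new_graph() (limited to ggml's default graph size) to ggml_new_graph_custom() with this cap, and size the context's compute buffer from it instead of GGML_MAX_NODES. A minimal sketch of the pattern, assuming a valid ggml_context * ctx0; illustrative only, mirroring the hunks further down:

    // reserve room for graph/tensor metadata using the new cap
    // ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

    // allocate graphs with the same node cap; 'false' = no gradient storage
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
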
@@ -190,6 +192,7 @@ enum llm_arch {
  LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BLOOM,
+ LLM_ARCH_STABLELM,
  LLM_ARCH_UNKNOWN,
  };

@@ -205,6 +208,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_PERSIMMON, "persimmon" },
  { LLM_ARCH_REFACT, "refact" },
  { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
  };

  enum llm_kv {
@@ -251,6 +255,8 @@ enum llm_kv {
  LLM_KV_TOKENIZER_UNK_ID,
  LLM_KV_TOKENIZER_SEP_ID,
  LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_ADD_BOS,
+ LLM_KV_TOKENIZER_ADD_EOS,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
  };
@@ -299,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
  { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+ { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
  };
@@ -493,6 +501,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  },
  },
+ {
+ LLM_ARCH_STABLELM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -577,6 +604,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
  return LLAMA_ROPE_SCALING_UNSPECIFIED;
  }

+ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+ switch (type) {
+ case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
+ case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
+ case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
+ case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
+ case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
+ case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
+ case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
+ case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
+ case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+ case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+ case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
+ default: return format("unknown type %d", type);
+ }
+ }
+
+ static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+ switch (type) {
+ case GGUF_TYPE_STRING:
+ return gguf_get_val_str(ctx_gguf, i);
+ case GGUF_TYPE_ARRAY:
+ {
+ const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+ int arr_n = gguf_get_arr_n(ctx_gguf, i);
+ const void * data = gguf_get_arr_data(ctx_gguf, i);
+ std::stringstream ss;
+ ss << "[";
+ for (int j = 0; j < arr_n; j++) {
+ if (arr_type == GGUF_TYPE_STRING) {
+ std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+ // escape quotes
+ replace_all(val, "\\", "\\\\");
+ replace_all(val, "\"", "\\\"");
+ ss << '"' << val << '"';
+ } else if (arr_type == GGUF_TYPE_ARRAY) {
+ ss << "???";
+ } else {
+ ss << gguf_data_to_str(arr_type, data, j);
+ }
+ if (j < arr_n - 1) {
+ ss << ", ";
+ }
+ }
+ ss << "]";
+ return ss.str();
+ }
+ default:
+ return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+ }
+ }
+
  //
  // ggml helpers
  //
@@ -1055,9 +1136,9 @@ enum e_model {
  MODEL_70B,
  };

- static const size_t kB = 1024;
- static const size_t MB = 1024*kB;
- static const size_t GB = 1024*MB;
+ static const size_t kiB = 1024;
+ static const size_t MiB = 1024*kiB;
+ static const size_t GiB = 1024*MiB;

  struct llama_hparams {
  bool vocab_only;
@@ -1194,6 +1275,7 @@ struct llama_kv_cache {
  // cannot be freely changed after a slot has been allocated.
  uint32_t head = 0;
  uint32_t size = 0;
+ uint32_t used = 0; // used cells (i.e. at least one seq_id)

  // computed before each graph build
  uint32_t n = 0;
@@ -1248,6 +1330,9 @@ struct llama_vocab {
  id special_sep_id = -1;
  id special_pad_id = -1;

+ int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+ int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
  id linefeed_id = 13;
  id special_prefix_id = 32007;
  id special_middle_id = 32009;
@@ -1292,6 +1377,9 @@ struct llama_model {

  int n_gpu_layers;

+ // gguf metadata
+ std::unordered_map<std::string, std::string> gguf_kv;
+
  // context
  struct ggml_context * ctx = NULL;

@@ -1412,6 +1500,7 @@ static bool llama_kv_cache_init(

  cache.head = 0;
  cache.size = n_ctx;
+ cache.used = 0;

  cache.cells.clear();
  cache.cells.resize(n_ctx);
@@ -1453,7 +1542,7 @@ static bool llama_kv_cache_init(
  vram_kv_cache += ggml_nbytes(cache.k);
  }
  if (vram_kv_cache > 0) {
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
  }
  }
  #endif
@@ -1513,6 +1602,8 @@ static bool llama_kv_cache_find_slot(
  }
  }

+ cache.used += n_tokens;
+
  return true;
  }

@@ -1533,6 +1624,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
  cache.cells[i].seq_id.clear();
  }
  cache.head = 0;
+ cache.used = 0;
  }

  static void llama_kv_cache_seq_rm(
@@ -1555,6 +1647,9 @@ static void llama_kv_cache_seq_rm(
  continue;
  }
  if (cache.cells[i].seq_id.empty()) {
+ // keep count of the number of used cells
+ if (cache.cells[i].pos >= 0) cache.used--;
+
  cache.cells[i].pos = -1;
  if (new_head == cache.size) new_head = i;
  }
@@ -1562,7 +1657,7 @@ static void llama_kv_cache_seq_rm(
  }

  // If we freed up a slot, set head to it so searching can start there.
- if (new_head != cache.size) cache.head = new_head;
+ if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }

  static void llama_kv_cache_seq_cp(
@@ -1588,6 +1683,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id

  for (uint32_t i = 0; i < cache.size; ++i) {
  if (!cache.cells[i].has_seq_id(seq_id)) {
+ if (cache.cells[i].pos >= 0) cache.used--;
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
  if (new_head == cache.size) new_head = i;
@@ -1598,7 +1694,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
  }

  // If we freed up a slot, set head to it so searching can start there.
- if (new_head != cache.size) cache.head = new_head;
+ if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  }

  static void llama_kv_cache_seq_shift(
@@ -1619,6 +1715,7 @@ static void llama_kv_cache_seq_shift(
  cache.cells[i].delta += delta;

  if (cache.cells[i].pos < 0) {
+ if (!cache.cells[i].seq_id.empty()) cache.used--;
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
  if (new_head == cache.size) new_head = i;
@@ -1750,10 +1847,10 @@ struct llama_model_loader {
  case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
  default:
- {
- LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
- ftype = LLAMA_FTYPE_ALL_F32;
- } break;
+ {
+ LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+ ftype = LLAMA_FTYPE_ALL_F32;
+ } break;
  }

  // this is a way to mark that we have "guessed" the file type
@@ -1767,10 +1864,21 @@ struct llama_model_loader {
  }

  for (int i = 0; i < n_kv; i++) {
- const char * name = gguf_get_key(ctx_gguf, i);
- const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const char * name = gguf_get_key(ctx_gguf, i);
+ const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const std::string type_name =
+ type == GGUF_TYPE_ARRAY
+ ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+ : gguf_type_name(type);
+
+ std::string value = gguf_kv_to_str(ctx_gguf, i);
+ const size_t MAX_VALUE_LEN = 40;
+ if (value.size() > MAX_VALUE_LEN) {
+ value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+ }
+ replace_all(value, "\n", "\\n");

- LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
+ LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
  }

  // print type counts
@@ -2065,6 +2173,17 @@ static void llm_load_hparams(

  auto & hparams = model.hparams;

+ // get metadata as string
+ for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+ enum gguf_type type = gguf_get_kv_type(ctx, i);
+ if (type == GGUF_TYPE_ARRAY) {
+ continue;
+ }
+ const char * name = gguf_get_key(ctx, i);
+ const std::string value = gguf_kv_to_str(ctx, i);
+ model.gguf_kv.emplace(name, value);
+ }
+
  // get general kv
  GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));

@@ -2209,6 +2328,16 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+
  default: (void)0;
  }

@@ -2350,6 +2479,23 @@ static void llm_load_vocab(
  __func__, key.c_str(), id, old_id);
  id = old_id;
  }
+
+ }
+
+ // Handle add_bos_token and add_eos_token
+ std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+ int kid = gguf_find_key(ctx, key.c_str());
+ enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+ vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+ }
+ key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+ kid = gguf_find_key(ctx, key.c_str());
+ ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+ vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+ if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+ LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
  }
  }

@@ -2481,8 +2627,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
  LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
  LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
- if (ml.n_bytes < GB) {
- LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+ if (ml.n_bytes < GiB) {
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  } else {
  LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  }
@@ -2520,7 +2666,7 @@ static void llm_load_tensors(

  ml.calc_sizes(ctx_size, mmapped_size);

- LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);

  // create the ggml context
  {
@@ -2872,6 +3018,13 @@ static void llm_load_tensors(
  ggml_backend_type backend_output;

  if (n_gpu_layers > int(n_layer)) {
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > int(n_layer + 1)) {
+ LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
+ __func__, n_layer + 1);
+ throw std::runtime_error("Persimmon CUDA offload failed");
+ }
+ #endif
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
@@ -3073,6 +3226,81 @@ static void llm_load_tensors(
  }
  }
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = llama_backend_offload;
+ #else
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+ #endif // _WIN32
+
+ backend_output = llama_backend_offload_split;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ /*
+ llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
+ */
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+ ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+ }
+ }
+ } break;
+
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -3087,7 +3315,7 @@ static void llm_load_tensors(
  ctx_size +
  mmapped_size - vram_weights; // weights in VRAM not in memory

- LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);

  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3106,7 +3334,7 @@ static void llm_load_tensors(
  #endif // GGML_USE_CUBLAS

  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
  #else
  (void) n_gpu_layers;
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3606,7 +3834,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_llama() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -3718,7 +3946,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_baichuan() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -3838,7 +4066,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_falcon() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -3960,7 +4188,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_starcoder() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * pos;
@@ -4059,7 +4287,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_persimmon() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  const int64_t n_rot = n_embd_head / 2;

@@ -4204,7 +4432,7 @@ struct llm_build_context {
  struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
  cb(Kcur, "Kcur", il);

- struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
  cb(Q, "Q", il);

  Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
@@ -4269,7 +4497,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_refact() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4360,7 +4588,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_bloom() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4454,7 +4682,7 @@ struct llm_build_context {
  }

  struct ggml_cgraph * build_mpt() {
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -4551,6 +4779,119 @@ struct llm_build_context {

  return gf;
  }
+
+ struct ggml_cgraph * build_stablelm() {
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_scale
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ cb(KQ_scale, "KQ_scale", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(ctx0, hparams, kv_self,
+ model.layers[il].wo, NULL,
+ Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };

  //
@@ -5020,6 +5361,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_mpt();
  } break;
+ case LLM_ARCH_STABLELM:
+ {
+ result = llm.build_stablelm();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -5129,6 +5474,12 @@ static int llama_decode_internal(
  batch.seq_id = seq_id_arr.data();
  }

+ // if we have enough unused cells before the current head ->
+ // better to start searching from the beginning of the cache, hoping to fill it
+ if (kv_self.head > kv_self.used + 2*n_tokens) {
+ kv_self.head = 0;
+ }
+
  if (!llama_kv_cache_find_slot(kv_self, batch)) {
  return 1;
  }
@@ -5139,7 +5490,7 @@ static int llama_decode_internal(
  //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
  kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));

- //printf("kv_self.n = %d\n", kv_self.n);
+ //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

  ggml_allocr_reset(lctx.alloc);

@@ -5195,7 +5546,8 @@ static int llama_decode_internal(
  model.arch == LLM_ARCH_FALCON ||
  model.arch == LLM_ARCH_REFACT ||
  model.arch == LLM_ARCH_MPT ||
- model.arch == LLM_ARCH_STARCODER;
+ model.arch == LLM_ARCH_STARCODER ||
+ model.arch == LLM_ARCH_STABLELM;

  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -5987,7 +6339,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
  // and passing 'add space prefix' as bool argument
  //
- auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ if (&fragment == &fragment_buffer.front()) {
+ raw_text = " " + raw_text; // prefix with space if the first token is not special
+ }

  #ifdef PRETOKENIZERDEBUG
  fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
@@ -7639,7 +7994,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  workers.clear();
  }

- LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
  int64_t tot_count = 0;
  for (size_t i = 0; i < hist_cur.size(); i++) {
  hist_all[i] += hist_cur[i];
@@ -8179,7 +8534,7 @@ struct llama_context * llama_new_context_with_model(

  {
  const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
- LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
  }

  // resized during inference
@@ -8196,7 +8551,7 @@ struct llama_context * llama_new_context_with_model(
  {
  static const size_t tensor_alignment = 32;
  // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
- ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+ ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

  // create measure allocator
  ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
@@ -8224,7 +8579,7 @@ struct llama_context * llama_new_context_with_model(
  // measure memory requirements for the graph
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

- LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);

  // recreate allocator with exact memory requirements
  ggml_allocr_free(ctx->alloc);
@@ -8238,7 +8593,7 @@ struct llama_context * llama_new_context_with_model(
  #endif
  #ifdef GGML_USE_CUBLAS
  ggml_cuda_set_scratch_size(alloc_size);
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);

  // calculate total VRAM usage
  auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8258,10 +8613,10 @@ struct llama_context * llama_new_context_with_model(
  size_t ctx_vram_size = alloc_size + kv_vram_size;
  size_t total_vram_size = model_vram_size + ctx_vram_size;

- LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
  total_vram_size / 1024.0 / 1024.0,
  model_vram_size / 1024.0 / 1024.0,
- ctx_vram_size / 1024.0 / 1024.0);
+ ctx_vram_size / 1024.0 / 1024.0);
  #endif
  }

@@ -8282,7 +8637,7 @@ struct llama_context * llama_new_context_with_model(

  const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

- LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);

  #define LLAMA_METAL_CHECK_BUF(result) \
  if (!(result)) { \
@@ -8348,6 +8703,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
  return model->hparams.rope_freq_scale_train;
  }

+ int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+ const auto & it = model->gguf_kv.find(key);
+ if (it == model->gguf_kv.end()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
+ int llama_model_meta_count(const struct llama_model * model) {
+ return (int)model->gguf_kv.size();
+ }
+
+ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = model->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->first.c_str());
+ }
+
+ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ if (i < 0 || i >= (int)model->gguf_kv.size()) {
+ if (buf_size > 0) {
+ buf[0] = '\0';
+ }
+ return -1;
+ }
+ auto it = model->gguf_kv.begin();
+ std::advance(it, i);
+ return snprintf(buf, buf_size, "%s", it->second.c_str());
+ }
+
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  return snprintf(buf, buf_size, "%s %s %s",
  llama_model_arch_name(model->arch).c_str(),
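
These four functions are the public face of the string-converted GGUF metadata that llm_load_hparams now copies into model.gguf_kv (array-valued keys are skipped). A rough usage sketch, assuming a model already loaded into llama_model * model; the buffer sizes are arbitrary and error handling is minimal:

    char key[256];
    char val[256];

    // enumerate every metadata key/value pair
    const int n_meta = llama_model_meta_count(model);
    for (int i = 0; i < n_meta; i++) {
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) >= 0 &&
            llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) >= 0) {
            printf("%s = %s\n", key, val);
        }
    }

    // or look up a single key; returns -1 (and writes an empty string) when it is missing
    if (llama_model_meta_val_str(model, "general.name", val, sizeof(val)) >= 0) {
        printf("general.name = %s\n", val);
    }
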
@@ -8406,8 +8800,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
  }
  }

+ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+ struct llama_kv_cache_view result = {
+ /*.n_cells = */ 0,
+ /*.n_max_seq = */ n_max_seq,
+ /*.token_count = */ 0,
+ /*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
+ /*.max_contiguous = */ 0,
+ /*.max_contiguous_idx = */ -1,
+ /*.cells = */ nullptr,
+ /*.cells_sequences = */ nullptr,
+ };
+ return result;
+ }
+
+ void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+ if (view->cells != nullptr) {
+ free(view->cells);
+ view->cells = nullptr;
+ }
+ if (view->cells_sequences != nullptr) {
+ free(view->cells_sequences);
+ view->cells_sequences = nullptr;
+ }
+ }
+
+ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+ if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+ view->n_cells = int32_t(ctx->kv_self.size);
+ void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+ GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+ view->cells = (struct llama_kv_cache_view_cell *)p;
+ p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+ GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+ view->cells_sequences = (llama_seq_id *)p;
+ }
+
+ const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+ llama_kv_cache_view_cell * c_curr = view->cells;
+ llama_seq_id * cs_curr = view->cells_sequences;
+ int32_t used_cells = 0;
+ int32_t token_count = 0;
+ int32_t curr_contig_idx = -1;
+ uint32_t max_contig = 0;
+ int32_t max_contig_idx = -1;
+
+ for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+ const size_t curr_size = kv_cells[i].seq_id.size();
+ token_count += curr_size;
+ c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+ if (curr_size > 0) {
+ if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+ max_contig = i - curr_contig_idx;
+ max_contig_idx = curr_contig_idx;
+ }
+ curr_contig_idx = -1;
+ } else if (curr_contig_idx < 0) {
+ curr_contig_idx = i;
+ }
+
+ int seq_idx = 0;
+ for (const llama_seq_id it : kv_cells[i].seq_id) {
+ if (seq_idx >= view->n_max_seq) {
+ break;
+ }
+ cs_curr[seq_idx] = it;
+ seq_idx++;
+ }
+ if (seq_idx != 0) {
+ used_cells++;
+ }
+ for (; seq_idx < view->n_max_seq; seq_idx++) {
+ cs_curr[seq_idx] = -1;
+ }
+ }
+ if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+ max_contig_idx = curr_contig_idx;
+ max_contig = kv_cells.size() - curr_contig_idx;
+ }
+ view->max_contiguous = max_contig;
+ view->max_contiguous_idx = max_contig_idx;
+ view->token_count = token_count;
+ view->used_cells = used_cells;
+ if (uint32_t(used_cells) != ctx->kv_self.used) {
+ LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+ __func__, ctx->kv_self.used, used_cells);
+ }
+ }
+
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->kv_self.head;
+ int result = 0;
+
+ for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
+ result += ctx->kv_self.cells[i].seq_id.size();
+ }
+
+ return result;
+ }
+
+ int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+ return ctx->kv_self.used;
  }

  void llama_kv_cache_clear(struct llama_context * ctx) {
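
The kv-cache view API above is a read-only debugging aid built on the new 'used' counter: it snapshots which cells are occupied, by which sequences, and where the largest contiguous run of free cells sits. Note that llama_get_kv_cache_token_count also changes meaning in this hunk: it now sums tokens across all cells instead of returning the cache head, and callers that relied on the old value as a rough cells-in-use figure should move to llama_get_kv_cache_used_cells. A hedged usage sketch, assuming a llama_context * ctx and a single sequence:

    // snapshot cache occupancy, print a summary, then release the view's buffers
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 1 /* n_max_seq */);
    llama_kv_cache_view_update(ctx, &view);
    printf("cells = %d, used = %d, tokens = %d, largest free run = %d (starting at %d)\n",
           view.n_cells, view.used_cells, view.token_count,
           view.max_contiguous, view.max_contiguous_idx);
    llama_kv_cache_view_free(&view);
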
@@ -8577,16 +9070,18 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  const size_t kv_buf_size = kv_self.buf.size;
  const uint32_t kv_head = kv_self.head;
  const uint32_t kv_size = kv_self.size;
+ const uint32_t kv_used = kv_self.used;

  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
  data_ctx->write(&kv_head, sizeof(kv_head));
  data_ctx->write(&kv_size, sizeof(kv_size));
+ data_ctx->write(&kv_used, sizeof(kv_used));

  if (kv_buf_size) {
  const size_t elt_size = ggml_element_size(kv_self.k);

- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
- ggml_cgraph gf{};
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
  std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -8604,9 +9099,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  kv_head, n_embd, n_layer,
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);

@@ -8703,18 +9198,20 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  size_t kv_buf_size;
  uint32_t kv_head;
  uint32_t kv_size;
+ uint32_t kv_used;

  memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
  memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+ memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);

  if (kv_buf_size) {
  GGML_ASSERT(kv_self.buf.size == kv_buf_size);

  const size_t elt_size = ggml_element_size(kv_self.k);

- ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
- ggml_cgraph gf{};
+ ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+ ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
  kin3d->data = (void *) inp;
@@ -8732,15 +9229,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  kv_head, n_embd, n_layer,
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
- ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+ ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+ ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

  ggml_free(cpy_ctx);
  }

  ctx->kv_self.head = kv_head;
  ctx->kv_self.size = kv_size;
+ ctx->kv_self.used = kv_used;

  ctx->kv_self.cells.resize(kv_size);

@@ -8989,6 +9487,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
  return model->vocab.linefeed_id;
  }

+ int llama_add_bos_token(const struct llama_model * model) {
+ return model->vocab.special_add_bos;
+ }
+
+ int llama_add_eos_token(const struct llama_model * model) {
+ return model->vocab.special_add_eos;
+ }
+
  llama_token llama_token_prefix(const struct llama_model * model) {
  return model->vocab.special_prefix_id;
  }
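
llama_add_bos_token and llama_add_eos_token expose the tokenizer.ggml.add_bos_token / tokenizer.ggml.add_eos_token flags read in llm_load_vocab, with -1 meaning the GGUF did not state a preference. A small sketch of how a caller might consume the BOS flag; the fallback policy is an assumption for illustration, not something the library prescribes:

    // 1 = add BOS, 0 = don't add, -1 = not specified in the model file
    const int add_bos = llama_add_bos_token(model);

    // assumed fallback: keep adding BOS when the model file doesn't say otherwise
    const bool use_bos = (add_bos != 0);
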