llama_cpp 0.0.6 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,7 @@
  #include <cstdio>
  #endif
 
- #include "llama_util.h"
+ #include "llama-util.h"
  #include "llama.h"
 
  #include "ggml.h"
@@ -27,11 +27,12 @@
  #include <thread>
  #include <atomic>
  #include <mutex>
+ #include <sstream>
+ #include <numeric>
 
  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
-
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
@@ -53,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
- { MODEL_65B, 512ull * MB },
+ { MODEL_65B, 1024ull * MB },
  };
  return _MEM_REQ_SCRATCH0;
  }
@@ -64,10 +65,10 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
  { MODEL_30B, 512ull * MB },
- { MODEL_65B, 512ull * MB },
+ { MODEL_65B, 1024ull * MB },
  };
  return _MEM_REQ_SCRATCH1;
- };
+ }
 
  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -79,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  { MODEL_65B, 5120ull * MB },
  };
  return _MEM_REQ_KV_SELF;
- };
+ }
 
  // this is mostly needed for temporary mul_mat buffers to dequantize the data
  // not actually needed if BLAS is disabled
@@ -92,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  { MODEL_65B, 1536ull * MB },
  };
  return _MEM_REQ_EVAL;
- };
+ }
 
  // default hparams (LLaMA 7B)
  struct llama_hparams {
@@ -135,7 +136,7 @@ struct llama_kv_cache {
 
  struct ggml_context * ctx = NULL;
 
- llama_buffer buf;
+ llama_ctx_buffer buf;
 
  int n; // number of tokens currently in the cache
 
@@ -166,7 +167,7 @@ struct llama_model {
  struct llama_kv_cache kv_self;
 
  // the model memory buffer
- llama_buffer buf;
+ llama_ctx_buffer buf;
 
  // model memory mapped file
  std::unique_ptr<llama_mmap> mapping;
@@ -227,8 +228,8 @@ struct llama_context {
 
  // memory buffers used to evaluate the model
  // TODO: move in llama_state
- llama_buffer buf_compute;
- llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+ llama_ctx_buffer buf_compute;
+ llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
  int buf_last = 0;
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -482,7 +483,9 @@ struct llama_file_loader {
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
  case GGML_TYPE_Q4_2:
- case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
  break;
  default: {
  throw format("unrecognized tensor type %u\n", shard.type);
@@ -556,7 +559,9 @@ struct llama_file_saver {
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
  case GGML_TYPE_Q4_2:
- case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
  break;
  default: LLAMA_ASSERT(false);
  }
@@ -654,6 +659,7 @@ struct llama_model_loader {
  LLAMA_ASSERT(lt.ne.size() == 1);
  tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
  }
+ ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
@@ -722,8 +728,7 @@ struct llama_model_loader {
  LLAMA_ASSERT(offset == lt.size);
  } else if (lt.split_type == SPLIT_BY_COLUMNS) {
  // Let's load the data into temporary buffers to ensure the OS performs large loads.
- std::vector<llama_buffer> tmp_bufs;
- tmp_bufs.resize(lt.shards.size());
+ std::vector<llama_buffer> tmp_bufs(lt.shards.size());
  for (size_t i = 0; i < lt.shards.size(); i++) {
  llama_load_tensor_shard & shard = lt.shards.at(i);
  llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -775,7 +780,7 @@ static bool kv_cache_init(
  const int n_embd = hparams.n_embd;
  const int n_layer = hparams.n_layer;
 
- const int64_t n_mem = (int64_t)n_layer*n_ctx;
+ const int64_t n_mem = n_layer*n_ctx;
  const int64_t n_elements = n_embd*n_mem;
 
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
@@ -794,6 +799,8 @@ static bool kv_cache_init(
 
  cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
  cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+ ggml_set_name(cache.k, "cache_k");
+ ggml_set_name(cache.v, "cache_v");
 
  return true;
  }
@@ -802,7 +809,7 @@ struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
  /*.n_ctx =*/ 512,
  /*.n_parts =*/ -1,
- /*.seed =*/ 0,
+ /*.seed =*/ -1,
  /*.f16_kv =*/ false,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
@@ -846,7 +853,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
  return "mostly Q4_1, some F16";
  case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
- case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
  default: return "unknown, may not work";
  }
  }
@@ -1075,9 +1084,10 @@ static bool llama_eval_internal(
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
  ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ ggml_set_name(embd, "embd");
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1104,6 +1114,8 @@ static bool llama_eval_internal(
  // compute Q and K and RoPE them
  struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
  struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ ggml_set_name(Qcur, "Qcur");
+ ggml_set_name(Kcur, "Kcur");
 
  // store key and value to memory
  {
@@ -1124,6 +1136,7 @@ static bool llama_eval_internal(
  ggml_permute(ctx0,
  Qcur,
  0, 2, 1, 3);
+ ggml_set_name(Q, "Q");
 
  struct ggml_tensor * K =
  ggml_permute(ctx0,
@@ -1131,21 +1144,26 @@ static bool llama_eval_internal(
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
  n_embd/n_head, n_head, n_past + N),
  0, 2, 1, 3);
+ ggml_set_name(K, "K");
 
  // K * Q
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ ggml_set_name(KQ, "KQ");
 
  // KQ_scaled = KQ / sqrt(n_embd/n_head)
- struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0,
- KQ,
- ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+ struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+ ggml_set_name(KQ_scaled, "KQ_scaled");
 
  // KQ_masked = mask_past(KQ_scaled)
  struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ ggml_set_name(KQ_masked, "KQ_masked");
 
  // KQ = soft_max(KQ_masked)
  struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
  // split cached V into n_head heads
  struct ggml_tensor * V =
@@ -1154,9 +1172,11 @@ static bool llama_eval_internal(
  n_ctx*ggml_element_size(kv_self.v),
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+ ggml_set_name(V, "V");
 
  #if 1
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ ggml_set_name(KQV, "KQV");
  #else
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1167,11 +1187,13 @@ static bool llama_eval_internal(
 
  // KQV_merged = KQV.permute(0, 2, 1, 3)
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ ggml_set_name(KQV_merged, "KQV_merged");
 
  // cur = KQV_merged.contiguous().view(n_embd, N)
  cur = ggml_cpy(ctx0,
  KQV_merged,
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ ggml_set_name(cur, "KQV_merged_contiguous");
 
  // projection (no bias)
  cur = ggml_mul_mat(ctx0,
@@ -1249,9 +1271,11 @@ static bool llama_eval_internal(
  ggml_build_forward_expand(&gf, inpL);
  ggml_graph_compute (ctx0, &gf);
 
+ #ifdef GGML_PERF
  // print timing information per ggml operation (for debugging purposes)
  // requires GGML_PERF to be defined
- //ggml_graph_print(&gf);
+ ggml_graph_print(&gf);
+ #endif
 
  // plot the computation graph in dot format (for debugging purposes)
  //if (n_past%100 == 0) {
@@ -1261,6 +1285,9 @@ static bool llama_eval_internal(
  //embd_w.resize(n_vocab*N);
  //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
 
+ // update kv token count
+ lctx.model.kv_self.n = n_past + N;
+
  // extract logits
  {
  auto & logits_out = lctx.logits;
@@ -1466,109 +1493,402 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
  // sampling
  //
 
- static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
- // find the top k tokens
- std::partial_sort(
- logits_id.begin(),
- logits_id.begin() + top_k, logits_id.end(),
- [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
- return a.first > b.first;
- });
+ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
+ assert(candidates->size > 0);
+
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // Sort the logits in descending order
+ if (!candidates->sorted) {
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit > b.logit;
+ });
+ candidates->sorted = true;
+ }
 
- logits_id.resize(top_k);
+ float max_l = candidates->data[0].logit;
+ float cum_sum = 0.0f;
+ for (size_t i = 0; i < candidates->size; ++i) {
+ float p = expf(candidates->data[i].logit - max_l);
+ candidates->data[i].p = p;
+ cum_sum += p;
+ }
+ for (size_t i = 0; i < candidates->size; ++i) {
+ candidates->data[i].p /= cum_sum;
+ }
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
  }
 
- static llama_vocab::id llama_sample_top_p_top_k(
- llama_context & lctx,
- const std::vector<llama_vocab::id> & last_n_tokens,
- int top_k,
- float top_p,
- float temp,
- float repeat_penalty) {
- auto & rng = lctx.rng;
-
- const int n_logits = lctx.model.hparams.n_vocab;
-
- const auto & logits = lctx.logits;
- const auto * plogits = logits.data() + logits.size() - n_logits;
-
- if (temp <= 0) {
- // select the token with the highest logit directly
- float max_logit = plogits[0];
- llama_vocab::id max_id = 0;
-
- for (int i = 1; i < n_logits; ++i) {
- if (plogits[i] > max_logit) {
- max_logit = plogits[i];
- max_id = i;
- }
+ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ k = std::max(k, (int) min_keep);
+ k = std::min(k, (int) candidates->size);
+
+ // Sort scores in descending order
+ if (!candidates->sorted) {
+ auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit > b.logit;
+ };
+ if (k == (int) candidates->size) {
+ std::sort(candidates->data, candidates->data + candidates->size, comp);
+ } else {
+ std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
  }
- return max_id;
+ candidates->sorted = true;
+ }
+ candidates->size = k;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }
+ }
 
- std::vector<std::pair<float, llama_vocab::id>> logits_id;
- logits_id.reserve(n_logits);
+ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+ if (p >= 1.0f) {
+ return;
+ }
 
- {
- const float scale = 1.0f/temp;
- for (int i = 0; i < n_logits; ++i) {
- // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
- // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
- if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
- // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
- if (plogits[i] < 0.0f) {
- logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
- } else {
- logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
- }
- } else {
- logits_id.push_back(std::make_pair(plogits[i]*scale, i));
- }
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ llama_sample_softmax(ctx, candidates);
+
+ // Compute the cumulative probabilities
+ float cum_sum = 0.0f;
+ size_t last_idx = candidates->size;
+
+ for (size_t i = 0; i < candidates->size; ++i) {
+ cum_sum += candidates->data[i].p;
+
+ // Check if the running sum is greater than p or if we have kept at least min_keep tokens
+ if (cum_sum > p && i >= min_keep) {
+ last_idx = i;
+ break;
  }
  }
 
- sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
+ // Resize the output vector to keep only the top-p tokens
+ candidates->size = last_idx;
 
- // compute probs for the top k tokens
- std::vector<float> probs;
- probs.reserve(logits_id.size());
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
 
- float maxl = logits_id[0].first;
- double sum = 0.0;
- for (const auto & kv : logits_id) {
- const float p = expf(kv.first - maxl);
- probs.push_back(p);
- sum += p;
+ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
+ if (z >= 1.0f || candidates->size <= 2) {
+ return;
  }
 
- // normalize the probs
- for (auto & p : probs) {
- p /= sum;
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ llama_sample_softmax(nullptr, candidates);
+
+ // Compute the first and second derivatives
+ std::vector<float> first_derivatives(candidates->size - 1);
+ std::vector<float> second_derivatives(candidates->size - 2);
+
+ for (size_t i = 0; i < first_derivatives.size(); ++i) {
+ first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
+ }
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
+ second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
  }
 
- if (top_p < 1.0) {
- double cumsum = 0.0;
- for (int i = 0; i < (int) probs.size(); i++) {
- cumsum += probs[i];
- if (cumsum >= top_p) {
- probs.resize(i + 1);
- logits_id.resize(i + 1);
- break;
- }
+ // Calculate absolute value of second derivatives
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
+ second_derivatives[i] = abs(second_derivatives[i]);
+ }
+
+ // Normalize the second derivatives
+ float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+ for (float & value : second_derivatives) {
+ value /= second_derivatives_sum;
+ }
+
+ float cum_sum = 0.0f;
+ size_t last_idx = candidates->size;
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
+ cum_sum += second_derivatives[i];
+
+ // Check if the running sum is greater than z or if we have kept at least min_keep tokens
+ if (cum_sum > z && i >= min_keep) {
+ last_idx = i;
+ break;
  }
  }
 
- //printf("\n");
- //for (int i = 0; i < (int) 10; i++) {
- // printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
- //}
- //printf("\n\n");
- //exit(0);
+ // Resize the output vector to keep only the tokens above the tail location
+ candidates->size = last_idx;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+
+ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+ // Reference implementation:
+ // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+ if (p >= 1.0f) {
+ return;
+ }
+
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // Compute the softmax of logits and calculate entropy
+ llama_sample_softmax(nullptr, candidates);
+
+ float entropy = 0.0f;
+ for (size_t i = 0; i < candidates->size; ++i) {
+ entropy += -candidates->data[i].p * logf(candidates->data[i].p);
+ }
+
+ // Compute the absolute difference between negative log probability and entropy for each candidate
+ std::vector<float> shifted_scores;
+ for (size_t i = 0; i < candidates->size; ++i) {
+ float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
+ shifted_scores.push_back(shifted_score);
+ }
+
+ // Sort tokens based on the shifted_scores and their corresponding indices
+ std::vector<size_t> indices(candidates->size);
+ std::iota(indices.begin(), indices.end(), 0);
+
+ std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
+ return shifted_scores[a] < shifted_scores[b];
+ });
+
+ // Compute the cumulative probabilities
+ float cum_sum = 0.0f;
+ size_t last_idx = indices.size();
+
+ for (size_t i = 0; i < indices.size(); ++i) {
+ size_t idx = indices[i];
+ cum_sum += candidates->data[idx].p;
+
+ // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
+ if (cum_sum > p && i >= min_keep - 1) {
+ last_idx = i + 1;
+ break;
+ }
+ }
+
+ // Resize the output vector to keep only the locally typical tokens
+ std::vector<llama_token_data> new_candidates;
+ for (size_t i = 0; i < last_idx; ++i) {
+ size_t idx = indices[i];
+ new_candidates.push_back(candidates->data[idx]);
+ }
+
+ // Replace the data in candidates with the new_candidates data
+ std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
+ candidates->size = new_candidates.size();
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ for (size_t i = 0; i < candidates_p->size; ++i) {
+ candidates_p->data[i].logit /= temp;
+ }
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
+ if (last_tokens_size == 0 || penalty == 1.0f) {
+ return;
+ }
+
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ for (size_t i = 0; i < candidates->size; ++i) {
+ auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+ if (token_iter == last_tokens + last_tokens_size) {
+ continue;
+ }
+
+ // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+ // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+ if (candidates->data[i].logit <= 0) {
+ candidates->data[i].logit *= penalty;
+ } else {
+ candidates->data[i].logit /= penalty;
+ }
+ }
+
+ candidates->sorted = false;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
+ if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+ return;
+ }
+
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // Create a frequency map to count occurrences of each token in last_tokens
+ std::unordered_map<llama_token, int> token_count;
+ for (size_t i = 0; i < last_tokens_size; ++i) {
+ token_count[last_tokens_p[i]]++;
+ }
+
+ // Apply frequency and presence penalties to the candidates
+ for (size_t i = 0; i < candidates->size; ++i) {
+ auto token_iter = token_count.find(candidates->data[i].id);
+ if (token_iter == token_count.end()) {
+ continue;
+ }
+
+ int count = token_iter->second;
+ candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
+ }
+
+ candidates->sorted = false;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+
+ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
+ assert(ctx);
+ auto N = float(llama_n_vocab(ctx));
+ int64_t t_start_sample_us;
+ t_start_sample_us = ggml_time_us();
+
+ llama_sample_softmax(nullptr, candidates);
+
+ // Estimate s_hat using the most probable m tokens
+ float s_hat = 0.0;
+ float sum_ti_bi = 0.0;
+ float sum_ti_sq = 0.0;
+ for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
+ float t_i = logf(float(i + 2) / float(i + 1));
+ float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
+ sum_ti_bi += t_i * b_i;
+ sum_ti_sq += t_i * t_i;
+ }
+ s_hat = sum_ti_bi / sum_ti_sq;
+
+ // Compute k from the estimated s_hat and target surprise value
+ float epsilon_hat = s_hat - 1;
+ float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
+
+ // Sample the next word X using top-k sampling
+ llama_sample_top_k(nullptr, candidates, int(k));
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ llama_token X = llama_sample_token(ctx, candidates);
+ t_start_sample_us = ggml_time_us();
+
+ // Compute error as the difference between observed surprise and target surprise value
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+ return candidate.id == X;
+ }));
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
+ float e = observed_surprise - tau;
+
+ // Update mu using the learning rate and error
+ *mu = *mu - eta * e;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ }
+ return X;
+ }
+
+ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
+ assert(ctx);
+ int64_t t_start_sample_us;
+ t_start_sample_us = ggml_time_us();
+
+ llama_sample_softmax(ctx, candidates);
+
+ // Truncate the words with surprise values greater than mu
+ candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+ return -log2f(candidate.p) > *mu;
+ }));
+
+ // Normalize the probabilities of the remaining words
+ llama_sample_softmax(ctx, candidates);
+
+ // Sample the next word X from the remaining words
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ llama_token X = llama_sample_token(ctx, candidates);
+ t_start_sample_us = ggml_time_us();
+
+ // Compute error as the difference between observed surprise and target surprise value
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+ return candidate.id == X;
+ }));
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
+ float e = observed_surprise - tau;
+
+ // Update mu using the learning rate and error
+ *mu = *mu - eta * e;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ return X;
+ }
+
+ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // Find max element
+ auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit < b.logit;
+ });
+
+ llama_token result = max_iter->id;
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ }
+ return result;
+ }
+
+ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+ assert(ctx);
+ const int64_t t_start_sample_us = ggml_time_us();
+ llama_sample_softmax(nullptr, candidates);
+
+ std::vector<float> probs;
+ probs.reserve(candidates->size);
+ for (size_t i = 0; i < candidates->size; ++i) {
+ probs.push_back(candidates->data[i].p);
+ }
 
  std::discrete_distribution<> dist(probs.begin(), probs.end());
+ auto & rng = ctx->rng;
  int idx = dist(rng);
 
- return logits_id[idx].second;
+ llama_token result = candidates->data[idx].id;
+
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ return result;
  }
 
  //
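Note: the hunk above replaces the single `llama_sample_top_p_top_k` helper with composable `llama_sample_*` functions that all operate on a caller-owned `llama_token_data_array`. The following is a minimal sketch (not part of the diff) of how a caller might chain them; it assumes the usual `llama.h` declarations, including `llama_get_logits`, and the penalty/top-k/top-p/temperature values are illustrative, not defaults shipped by this package.

    // Hypothetical helper: build a candidate array from the current logits and
    // run the new sampling pipeline in a typical order.
    #include <vector>
    #include "llama.h"

    llama_token sample_next(llama_context * ctx, const std::vector<llama_token> & last_tokens) {
        const int     n_vocab = llama_n_vocab(ctx);
        const float * logits  = llama_get_logits(ctx);   // logits of the last evaluated token

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token id = 0; id < n_vocab; id++) {
            candidates.push_back({ id, logits[id], 0.0f });   // { id, logit, p }
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // Penalties first, then truncation, then temperature, then the actual draw.
        llama_sample_repetition_penalty(ctx, &candidates_p, last_tokens.data(), last_tokens.size(), 1.10f);
        llama_sample_top_k(ctx, &candidates_p, 40, 1);
        llama_sample_top_p(ctx, &candidates_p, 0.95f, 1);
        llama_sample_temperature(ctx, &candidates_p, 0.80f);
        return llama_sample_token(ctx, &candidates_p);
    }

Greedy, mirostat, tail-free, and locally typical sampling slot into the same pipeline by swapping the truncation and draw steps for the corresponding `llama_sample_*` calls shown in the hunk.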
@@ -1581,7 +1901,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
  case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
- case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
+ case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
  default: throw format("invalid output file type %d\n", ftype);
  };
 
@@ -1618,8 +1940,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  // quantize only 2D tensors
  quantize &= (tensor.ne.size() == 2);
 
- // GG: uncomment this to keep the output layer in FP16
- //if (tensor.name.rfind("output")) {
+ // uncomment this to keep the output layer in FP16
+ //if (tensor.name == "output.weight") {
  // quantize = false;
  //}
 
@@ -1734,7 +2056,7 @@ struct llama_context * llama_init_from_file(
 
  llama_context * ctx = new llama_context;
 
- if (params.seed <= 0) {
+ if (params.seed < 0) {
  params.seed = time(NULL);
  }
 
@@ -1787,7 +2109,7 @@ struct llama_context * llama_init_from_file(
  if (params.logits_all) {
  ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
  } else {
- ctx->logits.reserve(hparams.n_ctx);
+ ctx->logits.reserve(hparams.n_vocab);
  }
 
  if (params.embedding){
@@ -2069,31 +2391,330 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
  }
 
- // Returns the KV cache that will contain the context for the
- // ongoing prediction with the model.
- const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
- return ctx->model.kv_self.buf.addr;
+ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
+ return ctx->model.kv_self.n;
  }
 
- // Returns the size of the KV cache
- size_t llama_get_kv_cache_size(struct llama_context * ctx) {
- return ctx->model.kv_self.buf.size;
+ #define LLAMA_MAX_RNG_STATE 64*1024
+
+ void llama_set_rng_seed(struct llama_context * ctx, int seed) {
+ if (seed < 0) {
+ seed = time(NULL);
+ }
+ ctx->rng.seed(seed);
  }
 
- int llama_get_kv_cache_token_count(struct llama_context * ctx) {
- return ctx->model.kv_self.n;
+ // Returns the *maximum* size of the state
+ size_t llama_get_state_size(const struct llama_context * ctx) {
+ // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+ // for reference, std::mt19937(1337) serializes to 6701 bytes.
+ const size_t s_rng_size = sizeof(size_t);
+ const size_t s_rng = LLAMA_MAX_RNG_STATE;
+ const size_t s_logits_capacity = sizeof(size_t);
+ const size_t s_logits_size = sizeof(size_t);
+ const size_t s_logits = ctx->logits.capacity() * sizeof(float);
+ const size_t s_embedding_size = sizeof(size_t);
+ const size_t s_embedding = ctx->embedding.size() * sizeof(float);
+ const size_t s_kv_size = sizeof(size_t);
+ const size_t s_kv_ntok = sizeof(int);
+ const size_t s_kv = ctx->model.kv_self.buf.size;
+
+ const size_t s_total = (
+ + s_rng_size
+ + s_rng
+ + s_logits_capacity
+ + s_logits_size
+ + s_logits
+ + s_embedding_size
+ + s_embedding
+ + s_kv_size
+ + s_kv_ntok
+ + s_kv
+ );
+
+ return s_total;
  }
 
- // Sets the KV cache containing the current context for the model
- void llama_set_kv_cache(
- struct llama_context * ctx,
- const uint8_t * kv_cache,
- size_t n_size,
- int n_token_count) {
- // Make sure we have the same kv cache setup
- LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
- memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
- ctx->model.kv_self.n = n_token_count;
+ // Copies the state to the specified destination address
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+ uint8_t * out = dest;
+
+ // copy rng
+ {
+ std::stringstream rng_ss;
+ rng_ss << ctx->rng;
+
+ const size_t rng_size = rng_ss.str().size();
+ char rng_buf[LLAMA_MAX_RNG_STATE];
+
+ memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
+ memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+
+ memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
+ memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+ }
+
+ // copy logits
+ {
+ const size_t logits_cap = ctx->logits.capacity();
+ const size_t logits_size = ctx->logits.size();
+
+ memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
+ memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+
+ if (logits_size) {
+ memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+ }
+
+ out += logits_cap * sizeof(float);
+ }
+
+ // copy embeddings
+ {
+ const size_t embedding_size = ctx->embedding.size();
+
+ memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+
+ if (embedding_size) {
+ memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
+ out += embedding_size * sizeof(float);
+ }
+ }
+
+ // copy kv cache
+ {
+ const auto & kv_self = ctx->model.kv_self;
+ const auto & hparams = ctx->model.hparams;
+ const int n_layer = hparams.n_layer;
+ const int n_embd = hparams.n_embd;
+ const int n_ctx = hparams.n_ctx;
+
+ const size_t kv_size = kv_self.buf.size;
+ const int kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+ memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+ memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+ if (kv_size) {
+ const size_t elt_size = ggml_element_size(kv_self.k);
+ char buffer[4096];
+ ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_cgraph gf{};
+ gf.n_threads = 1;
+
+ ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+ kout3d->data = out;
+ out += ggml_nbytes(kout3d);
+
+ ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+ vout3d->data = out;
+ out += ggml_nbytes(vout3d);
+
+ ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+ n_embd, kv_ntok, n_layer,
+ elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+
+ ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
+ kv_ntok, n_embd, n_layer,
+ elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+ ggml_graph_compute(cpy_ctx, &gf);
+ }
+ }
+
+ const size_t written = out - dest;
+ const size_t max_size = llama_get_state_size(ctx);
+
+ LLAMA_ASSERT(written <= max_size);
+
+ return written;
+ }
+
+ // Sets the state reading from the specified source address
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+ const uint8_t * in = src;
+
+ // set rng
+ {
+ size_t rng_size;
+ char rng_buf[LLAMA_MAX_RNG_STATE];
+
+ memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
+ memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+
+ std::stringstream rng_ss;
+ rng_ss.str(std::string(&rng_buf[0], rng_size));
+ rng_ss >> ctx->rng;
+
+ LLAMA_ASSERT(rng_ss.fail() == false);
+ }
+
+ // set logits
+ {
+ size_t logits_cap;
+ size_t logits_size;
+
+ memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
+ memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+ LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+
+ if (logits_size) {
+ ctx->logits.resize(logits_size);
+ memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+ }
+
+ in += logits_cap * sizeof(float);
+ }
+
+ // set embeddings
+ {
+ size_t embedding_size;
+
+ memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+ LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+
+ if (embedding_size) {
+ memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+ in += embedding_size * sizeof(float);
+ }
+ }
+
+ // set kv cache
+ {
+ const auto & kv_self = ctx->model.kv_self;
+ const auto & hparams = ctx->model.hparams;
+ const int n_layer = hparams.n_layer;
+ const int n_embd = hparams.n_embd;
+ const int n_ctx = hparams.n_ctx;
+
+ size_t kv_size;
+ int kv_ntok;
+
+ memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+ memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+ if (kv_size) {
+ LLAMA_ASSERT(kv_self.buf.size == kv_size);
+
+ const size_t elt_size = ggml_element_size(kv_self.k);
+ char buffer[4096];
+ ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_cgraph gf{};
+ gf.n_threads = 1;
+
+ ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+ kin3d->data = (void *) in;
+ in += ggml_nbytes(kin3d);
+
+ ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+ vin3d->data = (void *) in;
+ in += ggml_nbytes(vin3d);
+
+ ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+ n_embd, kv_ntok, n_layer,
+ elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+
+ ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
+ kv_ntok, n_embd, n_layer,
+ elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+ ggml_graph_compute(cpy_ctx, &gf);
+ }
+
+ ctx->model.kv_self.n = kv_ntok;
+ }
+
+ const size_t nread = in - src;
+ const size_t max_size = llama_get_state_size(ctx);
+
+ LLAMA_ASSERT(nread <= max_size);
+
+ return nread;
+ }
+
+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ llama_file file(path_session, "rb");
+
+ // sanity checks
+ {
+ const uint32_t magic = file.read_u32();
+ const uint32_t version = file.read_u32();
+
+ if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+ fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+ return false;
+ }
+
+ llama_hparams session_hparams;
+ file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+ if (session_hparams != ctx->model.hparams) {
+ fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+ return false;
+ }
+ }
+
+ // load the prompt
+ {
+ const uint32_t n_token_count = file.read_u32();
+
+ if (n_token_count > n_token_capacity) {
+ fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ return false;
+ }
+
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+ *n_token_count_out = n_token_count;
+ }
+
+ // restore the context state
+ {
+ const size_t n_state_size_cur = file.size - file.tell();
+ const size_t n_state_size_max = llama_get_state_size(ctx);
+
+ if (n_state_size_cur > n_state_size_max) {
+ fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+ return false;
+ }
+
+ std::vector<uint8_t> state_data(n_state_size_max);
+ file.read_raw(state_data.data(), n_state_size_cur);
+
+ llama_set_state_data(ctx, state_data.data());
+ }
+
+ return true;
+ }
+
+ bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ llama_file file(path_session, "wb");
+
+ file.write_u32(LLAMA_SESSION_MAGIC);
+ file.write_u32(LLAMA_SESSION_VERSION);
+
+ file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+ // save the prompt
+ file.write_u32((uint32_t) n_token_count);
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+ // save the context state
+ {
+ const size_t n_state_size_max = llama_get_state_size(ctx);
+
+ std::vector<uint8_t> state_data(n_state_size_max);
+ const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
+
+ file.write_raw(state_data.data(), n_state_size_cur);
+ }
+
+ return true;
  }
 
  int llama_eval(
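Note: the old `llama_get_kv_cache`/`llama_set_kv_cache` pair is removed; the hunk above adds whole-context state serialization (`llama_get_state_size`, `llama_copy_state_data`, `llama_set_state_data`) plus `llama_save_session_file`/`llama_load_session_file` built on top of it. A minimal round-trip sketch (not part of the diff), assuming only the signatures visible above:

    // Hypothetical snippet: snapshot the context state into a buffer and restore
    // it later (rng, logits, embeddings and KV cache).
    #include <cstddef>
    #include <cstdint>
    #include <vector>
    #include "llama.h"

    void snapshot_and_restore(llama_context * ctx) {
        // llama_get_state_size returns the *maximum* size; the copy may write less.
        std::vector<uint8_t> state(llama_get_state_size(ctx));
        const size_t written = llama_copy_state_data(ctx, state.data());
        (void) written;

        // ... evaluate more tokens here ...

        llama_set_state_data(ctx, state.data());   // roll back to the snapshot
    }

The session-file helpers wrap the same state buffer together with the prompt tokens and the model hparams, which is why a mismatched model is rejected on load.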
@@ -2134,15 +2755,15 @@ int llama_tokenize(
  return res.size();
  }
 
- int llama_n_vocab(struct llama_context * ctx) {
+ int llama_n_vocab(const struct llama_context * ctx) {
  return ctx->vocab.id_to_token.size();
  }
 
- int llama_n_ctx(struct llama_context * ctx) {
+ int llama_n_ctx(const struct llama_context * ctx) {
  return ctx->model.hparams.n_ctx;
  }
 
- int llama_n_embd(struct llama_context * ctx) {
+ int llama_n_embd(const struct llama_context * ctx) {
  return ctx->model.hparams.n_embd;
  }
 
@@ -2154,7 +2775,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
  return ctx->embedding.data();
  }
 
- const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
+ const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
  if (token >= llama_n_vocab(ctx)) {
  return nullptr;
  }
@@ -2170,33 +2791,8 @@ llama_token llama_token_eos() {
  return 2;
  }
 
- llama_token llama_sample_top_p_top_k(
- llama_context * ctx,
- const llama_token * last_n_tokens_data,
- int last_n_tokens_size,
- int top_k,
- float top_p,
- float temp,
- float repeat_penalty) {
- const int64_t t_start_sample_us = ggml_time_us();
-
- llama_token result = 0;
-
- // TODO: avoid this ...
- const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
-
- result = llama_sample_top_p_top_k(
- *ctx,
- last_n_tokens,
- top_k,
- top_p,
- temp,
- repeat_penalty);
-
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- ctx->n_sample++;
-
- return result;
+ llama_token llama_token_nl() {
+ return 13;
  }
 