llama_cpp 0.0.7 → 0.1.0

@@ -5,7 +5,7 @@
  #include <cstdio>
  #endif

- #include "llama_util.h"
+ #include "llama-util.h"
  #include "llama.h"

  #include "ggml.h"
@@ -28,11 +28,11 @@
  #include <atomic>
  #include <mutex>
  #include <sstream>
+ #include <numeric>

  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16

-
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
@@ -136,7 +136,7 @@ struct llama_kv_cache {

  struct ggml_context * ctx = NULL;

- llama_buffer buf;
+ llama_ctx_buffer buf;

  int n; // number of tokens currently in the cache

@@ -167,7 +167,7 @@ struct llama_model {
  struct llama_kv_cache kv_self;

  // the model memory buffer
- llama_buffer buf;
+ llama_ctx_buffer buf;

  // model memory mapped file
  std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +228,8 @@ struct llama_context {

  // memory buffers used to evaluate the model
  // TODO: move in llama_state
- llama_buffer buf_compute;
- llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+ llama_ctx_buffer buf_compute;
+ llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

  int buf_last = 0;
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -483,7 +483,6 @@ struct llama_file_loader {
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
  case GGML_TYPE_Q4_2:
- case GGML_TYPE_Q4_3:
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -560,7 +559,6 @@ struct llama_file_saver {
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
  case GGML_TYPE_Q4_2:
- case GGML_TYPE_Q4_3:
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -661,6 +659,7 @@ struct llama_model_loader {
  LLAMA_ASSERT(lt.ne.size() == 1);
  tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
  }
+ ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
@@ -729,8 +728,7 @@ struct llama_model_loader {
  LLAMA_ASSERT(offset == lt.size);
  } else if (lt.split_type == SPLIT_BY_COLUMNS) {
  // Let's load the data into temporary buffers to ensure the OS performs large loads.
- std::vector<llama_buffer> tmp_bufs;
- tmp_bufs.resize(lt.shards.size());
+ std::vector<llama_buffer> tmp_bufs(lt.shards.size());
  for (size_t i = 0; i < lt.shards.size(); i++) {
  llama_load_tensor_shard & shard = lt.shards.at(i);
  llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -782,7 +780,7 @@ static bool kv_cache_init(
  const int n_embd = hparams.n_embd;
  const int n_layer = hparams.n_layer;

- const int64_t n_mem = (int64_t)n_layer*n_ctx;
+ const int64_t n_mem = n_layer*n_ctx;
  const int64_t n_elements = n_embd*n_mem;

  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
@@ -801,6 +799,8 @@ static bool kv_cache_init(

  cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
  cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+ ggml_set_name(cache.k, "cache_k");
+ ggml_set_name(cache.v, "cache_v");

  return true;
  }
@@ -809,7 +809,7 @@ struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
  /*.n_ctx =*/ 512,
  /*.n_parts =*/ -1,
- /*.seed =*/ 0,
+ /*.seed =*/ -1,
  /*.f16_kv =*/ false,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
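
Note (not part of the diff): the default seed changes to -1, and llama_init_from_file() / llama_set_rng_seed() further down now treat any negative seed as "derive one from time(NULL)". A small sketch, with a hypothetical model path, of pinning the seed for reproducible sampling:

    llama_context_params params = llama_context_default_params();
    params.seed = 1234;   // any value >= 0 keeps runs reproducible; leave -1 for a time-based seed
    struct llama_context * ctx = llama_init_from_file("ggml-model-q4_0.bin", params);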
@@ -853,7 +853,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
  return "mostly Q4_1, some F16";
  case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
- case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -1088,6 +1087,7 @@ static bool llama_eval_internal(
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ ggml_set_name(embd, "embd");
  memcpy(embd->data, tokens, N*ggml_element_size(embd));

  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1114,6 +1114,8 @@ static bool llama_eval_internal(
  // compute Q and K and RoPE them
  struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
  struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ ggml_set_name(Qcur, "Qcur");
+ ggml_set_name(Kcur, "Kcur");

  // store key and value to memory
  {
@@ -1134,6 +1136,7 @@ static bool llama_eval_internal(
  ggml_permute(ctx0,
  Qcur,
  0, 2, 1, 3);
+ ggml_set_name(Q, "Q");

  struct ggml_tensor * K =
  ggml_permute(ctx0,
@@ -1141,21 +1144,26 @@ static bool llama_eval_internal(
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
  n_embd/n_head, n_head, n_past + N),
  0, 2, 1, 3);
+ ggml_set_name(K, "K");

  // K * Q
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ ggml_set_name(KQ, "KQ");

  // KQ_scaled = KQ / sqrt(n_embd/n_head)
- struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0,
- KQ,
- ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+ struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+ ggml_set_name(KQ_scaled, "KQ_scaled");

  // KQ_masked = mask_past(KQ_scaled)
  struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ ggml_set_name(KQ_masked, "KQ_masked");

  // KQ = soft_max(KQ_masked)
  struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");

  // split cached V into n_head heads
  struct ggml_tensor * V =
@@ -1164,9 +1172,11 @@ static bool llama_eval_internal(
  n_ctx*ggml_element_size(kv_self.v),
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+ ggml_set_name(V, "V");

  #if 1
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ ggml_set_name(KQV, "KQV");
  #else
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1177,11 +1187,13 @@ static bool llama_eval_internal(

  // KQV_merged = KQV.permute(0, 2, 1, 3)
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ ggml_set_name(KQV_merged, "KQV_merged");

  // cur = KQV_merged.contiguous().view(n_embd, N)
  cur = ggml_cpy(ctx0,
  KQV_merged,
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ ggml_set_name(cur, "KQV_merged_contiguous");

  // projection (no bias)
  cur = ggml_mul_mat(ctx0,
@@ -1273,6 +1285,9 @@ static bool llama_eval_internal(
  //embd_w.resize(n_vocab*N);
  //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

+ // update kv token count
+ lctx.model.kv_self.n = n_past + N;
+
  // extract logits
  {
  auto & logits_out = lctx.logits;
@@ -1478,109 +1493,402 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
  // sampling
  //

- static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
- // find the top k tokens
- std::partial_sort(
- logits_id.begin(),
- logits_id.begin() + top_k, logits_id.end(),
- [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
- return a.first > b.first;
- });
+ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
+ assert(candidates->size > 0);
+
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // Sort the logits in descending order
+ if (!candidates->sorted) {
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit > b.logit;
+ });
+ candidates->sorted = true;
+ }
+
+ float max_l = candidates->data[0].logit;
+ float cum_sum = 0.0f;
+ for (size_t i = 0; i < candidates->size; ++i) {
+ float p = expf(candidates->data[i].logit - max_l);
+ candidates->data[i].p = p;
+ cum_sum += p;
+ }
+ for (size_t i = 0; i < candidates->size; ++i) {
+ candidates->data[i].p /= cum_sum;
+ }

- logits_id.resize(top_k);
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
  }

- static llama_vocab::id llama_sample_top_p_top_k(
- llama_context & lctx,
- const std::vector<llama_vocab::id> & last_n_tokens,
- int top_k,
- float top_p,
- float temp,
- float repeat_penalty) {
- auto & rng = lctx.rng;
-
- const int n_logits = lctx.model.hparams.n_vocab;
-
- const auto & logits = lctx.logits;
- const auto * plogits = logits.data() + logits.size() - n_logits;
-
- if (temp <= 0) {
- // select the token with the highest logit directly
- float max_logit = plogits[0];
- llama_vocab::id max_id = 0;
-
- for (int i = 1; i < n_logits; ++i) {
- if (plogits[i] > max_logit) {
- max_logit = plogits[i];
- max_id = i;
- }
+ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ k = std::max(k, (int) min_keep);
+ k = std::min(k, (int) candidates->size);
+
+ // Sort scores in descending order
+ if (!candidates->sorted) {
+ auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit > b.logit;
+ };
+ if (k == (int) candidates->size) {
+ std::sort(candidates->data, candidates->data + candidates->size, comp);
+ } else {
+ std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
  }
- return max_id;
+ candidates->sorted = true;
+ }
+ candidates->size = k;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }
+ }

- std::vector<std::pair<float, llama_vocab::id>> logits_id;
- logits_id.reserve(n_logits);
+ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+ if (p >= 1.0f) {
+ return;
+ }

- {
- const float scale = 1.0f/temp;
- for (int i = 0; i < n_logits; ++i) {
- // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
- // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
- if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
- // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
- if (plogits[i] < 0.0f) {
- logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
- } else {
- logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
- }
- } else {
- logits_id.push_back(std::make_pair(plogits[i]*scale, i));
- }
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ llama_sample_softmax(ctx, candidates);
+
+ // Compute the cumulative probabilities
+ float cum_sum = 0.0f;
+ size_t last_idx = candidates->size;
+
+ for (size_t i = 0; i < candidates->size; ++i) {
+ cum_sum += candidates->data[i].p;
+
+ // Check if the running sum is greater than p or if we have kept at least min_keep tokens
+ if (cum_sum > p && i >= min_keep) {
+ last_idx = i;
+ break;
  }
  }

- sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
+ // Resize the output vector to keep only the top-p tokens
+ candidates->size = last_idx;

- // compute probs for the top k tokens
- std::vector<float> probs;
- probs.reserve(logits_id.size());
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }

- float maxl = logits_id[0].first;
- double sum = 0.0;
- for (const auto & kv : logits_id) {
- const float p = expf(kv.first - maxl);
- probs.push_back(p);
- sum += p;
+ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
+ if (z >= 1.0f || candidates->size <= 2) {
+ return;
  }

- // normalize the probs
- for (auto & p : probs) {
- p /= sum;
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ llama_sample_softmax(nullptr, candidates);
+
+ // Compute the first and second derivatives
+ std::vector<float> first_derivatives(candidates->size - 1);
+ std::vector<float> second_derivatives(candidates->size - 2);
+
+ for (size_t i = 0; i < first_derivatives.size(); ++i) {
+ first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
+ }
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
+ second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
  }

- if (top_p < 1.0) {
- double cumsum = 0.0;
- for (int i = 0; i < (int) probs.size(); i++) {
- cumsum += probs[i];
- if (cumsum >= top_p) {
- probs.resize(i + 1);
- logits_id.resize(i + 1);
- break;
- }
+ // Calculate absolute value of second derivatives
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
+ second_derivatives[i] = abs(second_derivatives[i]);
+ }
+
+ // Normalize the second derivatives
+ float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+ for (float & value : second_derivatives) {
+ value /= second_derivatives_sum;
+ }
+
+ float cum_sum = 0.0f;
+ size_t last_idx = candidates->size;
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
+ cum_sum += second_derivatives[i];
+
+ // Check if the running sum is greater than z or if we have kept at least min_keep tokens
+ if (cum_sum > z && i >= min_keep) {
+ last_idx = i;
+ break;
  }
  }

- //printf("\n");
- //for (int i = 0; i < (int) 10; i++) {
- // printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
- //}
- //printf("\n\n");
- //exit(0);
+ // Resize the output vector to keep only the tokens above the tail location
+ candidates->size = last_idx;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+
+ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+ // Reference implementation:
+ // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+ if (p >= 1.0f) {
+ return;
+ }
+
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // Compute the softmax of logits and calculate entropy
+ llama_sample_softmax(nullptr, candidates);
+
+ float entropy = 0.0f;
+ for (size_t i = 0; i < candidates->size; ++i) {
+ entropy += -candidates->data[i].p * logf(candidates->data[i].p);
+ }
+
+ // Compute the absolute difference between negative log probability and entropy for each candidate
+ std::vector<float> shifted_scores;
+ for (size_t i = 0; i < candidates->size; ++i) {
+ float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
+ shifted_scores.push_back(shifted_score);
+ }
+
+ // Sort tokens based on the shifted_scores and their corresponding indices
+ std::vector<size_t> indices(candidates->size);
+ std::iota(indices.begin(), indices.end(), 0);
+
+ std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
+ return shifted_scores[a] < shifted_scores[b];
+ });
+
+ // Compute the cumulative probabilities
+ float cum_sum = 0.0f;
+ size_t last_idx = indices.size();
+
+ for (size_t i = 0; i < indices.size(); ++i) {
+ size_t idx = indices[i];
+ cum_sum += candidates->data[idx].p;
+
+ // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
+ if (cum_sum > p && i >= min_keep - 1) {
+ last_idx = i + 1;
+ break;
+ }
+ }
+
+ // Resize the output vector to keep only the locally typical tokens
+ std::vector<llama_token_data> new_candidates;
+ for (size_t i = 0; i < last_idx; ++i) {
+ size_t idx = indices[i];
+ new_candidates.push_back(candidates->data[idx]);
+ }
+
+ // Replace the data in candidates with the new_candidates data
+ std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
+ candidates->size = new_candidates.size();
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ for (size_t i = 0; i < candidates_p->size; ++i) {
+ candidates_p->data[i].logit /= temp;
+ }
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
+ if (last_tokens_size == 0 || penalty == 1.0f) {
+ return;
+ }
+
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ for (size_t i = 0; i < candidates->size; ++i) {
+ auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+ if (token_iter == last_tokens + last_tokens_size) {
+ continue;
+ }
+
+ // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+ // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+ if (candidates->data[i].logit <= 0) {
+ candidates->data[i].logit *= penalty;
+ } else {
+ candidates->data[i].logit /= penalty;
+ }
+ }
+
+ candidates->sorted = false;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
+ if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+ return;
+ }
+
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // Create a frequency map to count occurrences of each token in last_tokens
+ std::unordered_map<llama_token, int> token_count;
+ for (size_t i = 0; i < last_tokens_size; ++i) {
+ token_count[last_tokens_p[i]]++;
+ }
+
+ // Apply frequency and presence penalties to the candidates
+ for (size_t i = 0; i < candidates->size; ++i) {
+ auto token_iter = token_count.find(candidates->data[i].id);
+ if (token_iter == token_count.end()) {
+ continue;
+ }
+
+ int count = token_iter->second;
+ candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
+ }
+
+ candidates->sorted = false;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+
+ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
+ assert(ctx);
+ auto N = float(llama_n_vocab(ctx));
+ int64_t t_start_sample_us;
+ t_start_sample_us = ggml_time_us();
+
+ llama_sample_softmax(nullptr, candidates);
+
+ // Estimate s_hat using the most probable m tokens
+ float s_hat = 0.0;
+ float sum_ti_bi = 0.0;
+ float sum_ti_sq = 0.0;
+ for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
+ float t_i = logf(float(i + 2) / float(i + 1));
+ float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
+ sum_ti_bi += t_i * b_i;
+ sum_ti_sq += t_i * t_i;
+ }
+ s_hat = sum_ti_bi / sum_ti_sq;
+
+ // Compute k from the estimated s_hat and target surprise value
+ float epsilon_hat = s_hat - 1;
+ float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
+
+ // Sample the next word X using top-k sampling
+ llama_sample_top_k(nullptr, candidates, int(k));
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ llama_token X = llama_sample_token(ctx, candidates);
+ t_start_sample_us = ggml_time_us();
+
+ // Compute error as the difference between observed surprise and target surprise value
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+ return candidate.id == X;
+ }));
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
+ float e = observed_surprise - tau;
+
+ // Update mu using the learning rate and error
+ *mu = *mu - eta * e;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ }
+ return X;
+ }
+
+ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
+ assert(ctx);
+ int64_t t_start_sample_us;
+ t_start_sample_us = ggml_time_us();
+
+ llama_sample_softmax(ctx, candidates);
+
+ // Truncate the words with surprise values greater than mu
+ candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+ return -log2f(candidate.p) > *mu;
+ }));
+
+ // Normalize the probabilities of the remaining words
+ llama_sample_softmax(ctx, candidates);
+
+ // Sample the next word X from the remaining words
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ llama_token X = llama_sample_token(ctx, candidates);
+ t_start_sample_us = ggml_time_us();
+
+ // Compute error as the difference between observed surprise and target surprise value
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+ return candidate.id == X;
+ }));
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
+ float e = observed_surprise - tau;
+
+ // Update mu using the learning rate and error
+ *mu = *mu - eta * e;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ return X;
+ }
+
+ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // Find max element
+ auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit < b.logit;
+ });
+
+ llama_token result = max_iter->id;
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ }
+ return result;
+ }
+
+ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+ assert(ctx);
+ const int64_t t_start_sample_us = ggml_time_us();
+ llama_sample_softmax(nullptr, candidates);
+
+ std::vector<float> probs;
+ probs.reserve(candidates->size);
+ for (size_t i = 0; i < candidates->size; ++i) {
+ probs.push_back(candidates->data[i].p);
+ }

  std::discrete_distribution<> dist(probs.begin(), probs.end());
+ auto & rng = ctx->rng;
  int idx = dist(rng);

- return logits_id[idx].second;
+ llama_token result = candidates->data[idx].id;
+
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ return result;
  }

  //
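
Note (not part of the diff): the hunk above removes the old llama_sample_top_p_top_k helper and replaces it with composable sampling primitives. A minimal sketch of how a caller might chain them, assuming the llama_token_data / llama_token_data_array layout used above and llama_get_logits() / llama_n_vocab() from this version's llama.h:

    // Sketch only: build candidates from the current logits, apply penalties and
    // truncation filters, then draw a token. Parameter values are illustrative.
    // assumes: #include "llama.h" and <vector>
    static llama_token sample_next(struct llama_context * ctx,
                                   const std::vector<llama_token> & last_tokens,
                                   int top_k, float top_p, float temp, float repeat_penalty) {
        const int n_vocab = llama_n_vocab(ctx);
        float * logits = llama_get_logits(ctx);               // logits of the last evaluated token

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token id = 0; id < n_vocab; ++id) {
            candidates.push_back({ id, logits[id], 0.0f });   // { id, logit, p }
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        llama_sample_repetition_penalty(ctx, &candidates_p, last_tokens.data(), last_tokens.size(), repeat_penalty);
        llama_sample_top_k(ctx, &candidates_p, top_k, 1);
        llama_sample_top_p(ctx, &candidates_p, top_p, 1);
        llama_sample_temperature(ctx, &candidates_p, temp);
        return llama_sample_token(ctx, &candidates_p);        // uses ctx->rng, updates timing counters
    }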
@@ -1593,7 +1901,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
  case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
- case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1749,7 +2056,7 @@ struct llama_context * llama_init_from_file(

  llama_context * ctx = new llama_context;

- if (params.seed <= 0) {
+ if (params.seed < 0) {
  params.seed = time(NULL);
  }

@@ -2084,21 +2391,21 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
  }

- int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
  return ctx->model.kv_self.n;
  }

  #define LLAMA_MAX_RNG_STATE 64*1024

  void llama_set_rng_seed(struct llama_context * ctx, int seed) {
- if (seed <= 0) {
+ if (seed < 0) {
  seed = time(NULL);
  }
  ctx->rng.seed(seed);
  }

- // Returns the size of the state
- size_t llama_get_state_size(struct llama_context * ctx) {
+ // Returns the *maximum* size of the state
+ size_t llama_get_state_size(const struct llama_context * ctx) {
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
  const size_t s_rng_size = sizeof(size_t);
@@ -2176,21 +2483,51 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

  // copy kv cache
  {
- const size_t kv_size = ctx->model.kv_self.buf.size;
+ const auto & kv_self = ctx->model.kv_self;
+ const auto & hparams = ctx->model.hparams;
+ const int n_layer = hparams.n_layer;
+ const int n_embd = hparams.n_embd;
+ const int n_ctx = hparams.n_ctx;
+
+ const size_t kv_size = kv_self.buf.size;
  const int kv_ntok = llama_get_kv_cache_token_count(ctx);

  memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
  memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);

  if (kv_size) {
- memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+ const size_t elt_size = ggml_element_size(kv_self.k);
+ char buffer[4096];
+ ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_cgraph gf{};
+ gf.n_threads = 1;
+
+ ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+ kout3d->data = out;
+ out += ggml_nbytes(kout3d);
+
+ ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+ vout3d->data = out;
+ out += ggml_nbytes(vout3d);
+
+ ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+ n_embd, kv_ntok, n_layer,
+ elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+
+ ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
+ kv_ntok, n_embd, n_layer,
+ elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+ ggml_graph_compute(cpy_ctx, &gf);
  }
  }

  const size_t written = out - dest;
- const size_t expected = llama_get_state_size(ctx);
+ const size_t max_size = llama_get_state_size(ctx);

- LLAMA_ASSERT(written == expected);
+ LLAMA_ASSERT(written <= max_size);

  return written;
  }
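
Note (not part of the diff): llama_get_state_size() now reports an upper bound and llama_copy_state_data() returns the number of bytes actually written, so callers should allocate the bound and then shrink. A rough sketch of the round trip under those assumptions:

    // assumes: #include "llama.h", <cstdint>, <vector>
    std::vector<uint8_t> snapshot_state(struct llama_context * ctx) {
        std::vector<uint8_t> buf(llama_get_state_size(ctx));   // maximum size
        buf.resize(llama_copy_state_data(ctx, buf.data()));    // actual size may be smaller
        return buf;
    }

    void restore_state(struct llama_context * ctx, const std::vector<uint8_t> & buf) {
        llama_set_state_data(ctx, buf.data());                 // asserts nread <= the reported bound
    }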
@@ -2248,6 +2585,12 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {

  // set kv cache
  {
+ const auto & kv_self = ctx->model.kv_self;
+ const auto & hparams = ctx->model.hparams;
+ const int n_layer = hparams.n_layer;
+ const int n_embd = hparams.n_embd;
+ const int n_ctx = hparams.n_ctx;
+
  size_t kv_size;
  int kv_ntok;

@@ -2255,29 +2598,125 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);

  if (kv_size) {
- LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+ LLAMA_ASSERT(kv_self.buf.size == kv_size);
+
+ const size_t elt_size = ggml_element_size(kv_self.k);
+ char buffer[4096];
+ ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_cgraph gf{};
+ gf.n_threads = 1;
+
+ ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+ kin3d->data = (void *) in;
+ in += ggml_nbytes(kin3d);

- void * k_data = ctx->model.kv_self.k->data; // remember data pointers
- void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+ ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+ vin3d->data = (void *) in;
+ in += ggml_nbytes(vin3d);

- memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+ ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+ n_embd, kv_ntok, n_layer,
+ elt_size*n_embd, elt_size*n_embd*n_ctx, 0);

- ctx->model.kv_self.k->data = k_data; // restore correct data pointers
- ctx->model.kv_self.v->data = v_data;
+ ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
+ kv_ntok, n_embd, n_layer,
+ elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+ ggml_graph_compute(cpy_ctx, &gf);
  }

  ctx->model.kv_self.n = kv_ntok;
  }

  const size_t nread = in - src;
- const size_t expected = llama_get_state_size(ctx);
+ const size_t max_size = llama_get_state_size(ctx);

- LLAMA_ASSERT(nread == expected);
+ LLAMA_ASSERT(nread <= max_size);

  return nread;
  }

+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ llama_file file(path_session, "rb");
+
+ // sanity checks
+ {
+ const uint32_t magic = file.read_u32();
+ const uint32_t version = file.read_u32();
+
+ if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+ fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+ return false;
+ }
+
+ llama_hparams session_hparams;
+ file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+ if (session_hparams != ctx->model.hparams) {
+ fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+ return false;
+ }
+ }
+
+ // load the prompt
+ {
+ const uint32_t n_token_count = file.read_u32();
+
+ if (n_token_count > n_token_capacity) {
+ fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ return false;
+ }
+
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+ *n_token_count_out = n_token_count;
+ }
+
+ // restore the context state
+ {
+ const size_t n_state_size_cur = file.size - file.tell();
+ const size_t n_state_size_max = llama_get_state_size(ctx);
+
+ if (n_state_size_cur > n_state_size_max) {
+ fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+ return false;
+ }
+
+ std::vector<uint8_t> state_data(n_state_size_max);
+ file.read_raw(state_data.data(), n_state_size_cur);
+
+ llama_set_state_data(ctx, state_data.data());
+ }
+
+ return true;
+ }
+
+ bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ llama_file file(path_session, "wb");
+
+ file.write_u32(LLAMA_SESSION_MAGIC);
+ file.write_u32(LLAMA_SESSION_VERSION);
+
+ file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+ // save the prompt
+ file.write_u32((uint32_t) n_token_count);
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+ // save the context state
+ {
+ const size_t n_state_size_max = llama_get_state_size(ctx);
+
+ std::vector<uint8_t> state_data(n_state_size_max);
+ const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
+
+ file.write_raw(state_data.data(), n_state_size_cur);
+ }
+
+ return true;
+ }
+
  int llama_eval(
  struct llama_context * ctx,
  const llama_token * tokens,
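
Note (not part of the diff): the new llama_save_session_file / llama_load_session_file pair persists the evaluated prompt tokens together with the KV cache state. A hypothetical usage sketch (path and thread count are placeholders):

    // assumes: #include "llama.h" and <vector>
    bool reuse_or_build_session(struct llama_context * ctx,
                                const std::vector<llama_token> & prompt,
                                const char * path) {
        std::vector<llama_token> cached(llama_n_ctx(ctx));
        size_t n_cached = 0;
        if (llama_load_session_file(ctx, path, cached.data(), cached.size(), &n_cached)) {
            return true;                                    // KV cache and prompt restored from disk
        }
        // no usable session: evaluate the prompt once, then save it for next time
        if (llama_eval(ctx, prompt.data(), (int) prompt.size(), 0, /*n_threads=*/4) != 0) {
            return false;
        }
        return llama_save_session_file(ctx, path, prompt.data(), prompt.size());
    }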
@@ -2316,15 +2755,15 @@ int llama_tokenize(
  return res.size();
  }

- int llama_n_vocab(struct llama_context * ctx) {
+ int llama_n_vocab(const struct llama_context * ctx) {
  return ctx->vocab.id_to_token.size();
  }

- int llama_n_ctx(struct llama_context * ctx) {
+ int llama_n_ctx(const struct llama_context * ctx) {
  return ctx->model.hparams.n_ctx;
  }

- int llama_n_embd(struct llama_context * ctx) {
+ int llama_n_embd(const struct llama_context * ctx) {
  return ctx->model.hparams.n_embd;
  }

@@ -2336,7 +2775,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
  return ctx->embedding.data();
  }

- const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
+ const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
  if (token >= llama_n_vocab(ctx)) {
  return nullptr;
  }
@@ -2352,33 +2791,8 @@ llama_token llama_token_eos() {
  return 2;
  }

- llama_token llama_sample_top_p_top_k(
- llama_context * ctx,
- const llama_token * last_n_tokens_data,
- int last_n_tokens_size,
- int top_k,
- float top_p,
- float temp,
- float repeat_penalty) {
- const int64_t t_start_sample_us = ggml_time_us();
-
- llama_token result = 0;
-
- // TODO: avoid this ...
- const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
-
- result = llama_sample_top_p_top_k(
- *ctx,
- last_n_tokens,
- top_k,
- top_p,
- temp,
- repeat_penalty);
-
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- ctx->n_sample++;
-
- return result;
+ llama_token llama_token_nl() {
+ return 13;
  }


@@ -2430,4 +2844,3 @@ const char * llama_print_system_info(void) {
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
-