llama_cpp 0.0.7 → 0.1.0

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registries.
@@ -5,7 +5,7 @@
  #include <cstdio>
  #endif

- #include "llama_util.h"
+ #include "llama-util.h"
  #include "llama.h"

  #include "ggml.h"
@@ -28,11 +28,11 @@
  #include <atomic>
  #include <mutex>
  #include <sstream>
+ #include <numeric>

  #define LLAMA_USE_SCRATCH
  #define LLAMA_MAX_SCRATCH_BUFFERS 16

-
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
@@ -136,7 +136,7 @@ struct llama_kv_cache {

  struct ggml_context * ctx = NULL;

- llama_buffer buf;
+ llama_ctx_buffer buf;

  int n; // number of tokens currently in the cache

@@ -167,7 +167,7 @@ struct llama_model {
  struct llama_kv_cache kv_self;

  // the model memory buffer
- llama_buffer buf;
+ llama_ctx_buffer buf;

  // model memory mapped file
  std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +228,8 @@ struct llama_context {

  // memory buffers used to evaluate the model
  // TODO: move in llama_state
- llama_buffer buf_compute;
- llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+ llama_ctx_buffer buf_compute;
+ llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

  int buf_last = 0;
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -483,7 +483,6 @@ struct llama_file_loader {
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
  case GGML_TYPE_Q4_2:
- case GGML_TYPE_Q4_3:
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -560,7 +559,6 @@ struct llama_file_saver {
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
  case GGML_TYPE_Q4_2:
- case GGML_TYPE_Q4_3:
  case GGML_TYPE_Q5_0:
  case GGML_TYPE_Q5_1:
  case GGML_TYPE_Q8_0:
@@ -661,6 +659,7 @@ struct llama_model_loader {
  LLAMA_ASSERT(lt.ne.size() == 1);
  tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
  }
+ ggml_set_name(tensor, lt.name.c_str());
  LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
  lt.ggml_tensor = tensor;
  num_ggml_tensors_created++;
@@ -729,8 +728,7 @@ struct llama_model_loader {
  LLAMA_ASSERT(offset == lt.size);
  } else if (lt.split_type == SPLIT_BY_COLUMNS) {
  // Let's load the data into temporary buffers to ensure the OS performs large loads.
- std::vector<llama_buffer> tmp_bufs;
- tmp_bufs.resize(lt.shards.size());
+ std::vector<llama_buffer> tmp_bufs(lt.shards.size());
  for (size_t i = 0; i < lt.shards.size(); i++) {
  llama_load_tensor_shard & shard = lt.shards.at(i);
  llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -782,7 +780,7 @@ static bool kv_cache_init(
  const int n_embd = hparams.n_embd;
  const int n_layer = hparams.n_layer;

- const int64_t n_mem = (int64_t)n_layer*n_ctx;
+ const int64_t n_mem = n_layer*n_ctx;
  const int64_t n_elements = n_embd*n_mem;

  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
@@ -801,6 +799,8 @@ static bool kv_cache_init(

  cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
  cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+ ggml_set_name(cache.k, "cache_k");
+ ggml_set_name(cache.v, "cache_v");

  return true;
  }
@@ -809,7 +809,7 @@ struct llama_context_params llama_context_default_params() {
  struct llama_context_params result = {
  /*.n_ctx =*/ 512,
  /*.n_parts =*/ -1,
- /*.seed =*/ 0,
+ /*.seed =*/ -1,
  /*.f16_kv =*/ false,
  /*.logits_all =*/ false,
  /*.vocab_only =*/ false,
@@ -853,7 +853,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
  return "mostly Q4_1, some F16";
  case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
- case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
  case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
  case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -1088,6 +1087,7 @@ static bool llama_eval_internal(
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ ggml_set_name(embd, "embd");
  memcpy(embd->data, tokens, N*ggml_element_size(embd));

  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1114,6 +1114,8 @@ static bool llama_eval_internal(
  // compute Q and K and RoPE them
  struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
  struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+ ggml_set_name(Qcur, "Qcur");
+ ggml_set_name(Kcur, "Kcur");

  // store key and value to memory
  {
@@ -1134,6 +1136,7 @@ static bool llama_eval_internal(
  ggml_permute(ctx0,
  Qcur,
  0, 2, 1, 3);
+ ggml_set_name(Q, "Q");

  struct ggml_tensor * K =
  ggml_permute(ctx0,
@@ -1141,21 +1144,26 @@ static bool llama_eval_internal(
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
  n_embd/n_head, n_head, n_past + N),
  0, 2, 1, 3);
+ ggml_set_name(K, "K");

  // K * Q
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ ggml_set_name(KQ, "KQ");

  // KQ_scaled = KQ / sqrt(n_embd/n_head)
- struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0,
- KQ,
- ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+ struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+ ggml_set_name(KQ_scaled, "KQ_scaled");

  // KQ_masked = mask_past(KQ_scaled)
  struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ ggml_set_name(KQ_masked, "KQ_masked");

  // KQ = soft_max(KQ_masked)
  struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");

  // split cached V into n_head heads
  struct ggml_tensor * V =
@@ -1164,9 +1172,11 @@ static bool llama_eval_internal(
  n_ctx*ggml_element_size(kv_self.v),
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+ ggml_set_name(V, "V");

  #if 1
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ ggml_set_name(KQV, "KQV");
  #else
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1177,11 +1187,13 @@ static bool llama_eval_internal(

  // KQV_merged = KQV.permute(0, 2, 1, 3)
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ ggml_set_name(KQV_merged, "KQV_merged");

  // cur = KQV_merged.contiguous().view(n_embd, N)
  cur = ggml_cpy(ctx0,
  KQV_merged,
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ ggml_set_name(cur, "KQV_merged_contiguous");

  // projection (no bias)
  cur = ggml_mul_mat(ctx0,
@@ -1273,6 +1285,9 @@ static bool llama_eval_internal(
  //embd_w.resize(n_vocab*N);
  //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

+ // update kv token count
+ lctx.model.kv_self.n = n_past + N;
+
  // extract logits
  {
  auto & logits_out = lctx.logits;
@@ -1478,109 +1493,402 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
  // sampling
  //

- static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
- // find the top k tokens
- std::partial_sort(
- logits_id.begin(),
- logits_id.begin() + top_k, logits_id.end(),
- [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
- return a.first > b.first;
- });
+ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
+ assert(candidates->size > 0);
+
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // Sort the logits in descending order
+ if (!candidates->sorted) {
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit > b.logit;
+ });
+ candidates->sorted = true;
+ }
+
+ float max_l = candidates->data[0].logit;
+ float cum_sum = 0.0f;
+ for (size_t i = 0; i < candidates->size; ++i) {
+ float p = expf(candidates->data[i].logit - max_l);
+ candidates->data[i].p = p;
+ cum_sum += p;
+ }
+ for (size_t i = 0; i < candidates->size; ++i) {
+ candidates->data[i].p /= cum_sum;
+ }

- logits_id.resize(top_k);
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
  }

- static llama_vocab::id llama_sample_top_p_top_k(
- llama_context & lctx,
- const std::vector<llama_vocab::id> & last_n_tokens,
- int top_k,
- float top_p,
- float temp,
- float repeat_penalty) {
- auto & rng = lctx.rng;
-
- const int n_logits = lctx.model.hparams.n_vocab;
-
- const auto & logits = lctx.logits;
- const auto * plogits = logits.data() + logits.size() - n_logits;
-
- if (temp <= 0) {
- // select the token with the highest logit directly
- float max_logit = plogits[0];
- llama_vocab::id max_id = 0;
-
- for (int i = 1; i < n_logits; ++i) {
- if (plogits[i] > max_logit) {
- max_logit = plogits[i];
- max_id = i;
- }
+ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ k = std::max(k, (int) min_keep);
+ k = std::min(k, (int) candidates->size);
+
+ // Sort scores in descending order
+ if (!candidates->sorted) {
+ auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit > b.logit;
+ };
+ if (k == (int) candidates->size) {
+ std::sort(candidates->data, candidates->data + candidates->size, comp);
+ } else {
+ std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
  }
- return max_id;
+ candidates->sorted = true;
+ }
+ candidates->size = k;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  }
+ }

- std::vector<std::pair<float, llama_vocab::id>> logits_id;
- logits_id.reserve(n_logits);
+ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+ if (p >= 1.0f) {
+ return;
+ }

- {
- const float scale = 1.0f/temp;
- for (int i = 0; i < n_logits; ++i) {
- // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
- // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
- if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
- // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
- if (plogits[i] < 0.0f) {
- logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
- } else {
- logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
- }
- } else {
- logits_id.push_back(std::make_pair(plogits[i]*scale, i));
- }
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ llama_sample_softmax(ctx, candidates);
+
+ // Compute the cumulative probabilities
+ float cum_sum = 0.0f;
+ size_t last_idx = candidates->size;
+
+ for (size_t i = 0; i < candidates->size; ++i) {
+ cum_sum += candidates->data[i].p;
+
+ // Check if the running sum is greater than p or if we have kept at least min_keep tokens
+ if (cum_sum > p && i >= min_keep) {
+ last_idx = i;
+ break;
  }
  }

- sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
+ // Resize the output vector to keep only the top-p tokens
+ candidates->size = last_idx;

- // compute probs for the top k tokens
- std::vector<float> probs;
- probs.reserve(logits_id.size());
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }

- float maxl = logits_id[0].first;
- double sum = 0.0;
- for (const auto & kv : logits_id) {
- const float p = expf(kv.first - maxl);
- probs.push_back(p);
- sum += p;
+ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
+ if (z >= 1.0f || candidates->size <= 2) {
+ return;
  }

- // normalize the probs
- for (auto & p : probs) {
- p /= sum;
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ llama_sample_softmax(nullptr, candidates);
+
+ // Compute the first and second derivatives
+ std::vector<float> first_derivatives(candidates->size - 1);
+ std::vector<float> second_derivatives(candidates->size - 2);
+
+ for (size_t i = 0; i < first_derivatives.size(); ++i) {
+ first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
+ }
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
+ second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
  }

- if (top_p < 1.0) {
- double cumsum = 0.0;
- for (int i = 0; i < (int) probs.size(); i++) {
- cumsum += probs[i];
- if (cumsum >= top_p) {
- probs.resize(i + 1);
- logits_id.resize(i + 1);
- break;
- }
+ // Calculate absolute value of second derivatives
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
+ second_derivatives[i] = abs(second_derivatives[i]);
+ }
+
+ // Normalize the second derivatives
+ float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+ for (float & value : second_derivatives) {
+ value /= second_derivatives_sum;
+ }
+
+ float cum_sum = 0.0f;
+ size_t last_idx = candidates->size;
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
+ cum_sum += second_derivatives[i];
+
+ // Check if the running sum is greater than z or if we have kept at least min_keep tokens
+ if (cum_sum > z && i >= min_keep) {
+ last_idx = i;
+ break;
  }
  }

- //printf("\n");
- //for (int i = 0; i < (int) 10; i++) {
- // printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
- //}
- //printf("\n\n");
- //exit(0);
+ // Resize the output vector to keep only the tokens above the tail location
+ candidates->size = last_idx;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+
+ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+ // Reference implementation:
+ // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+ if (p >= 1.0f) {
+ return;
+ }
+
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // Compute the softmax of logits and calculate entropy
+ llama_sample_softmax(nullptr, candidates);
+
+ float entropy = 0.0f;
+ for (size_t i = 0; i < candidates->size; ++i) {
+ entropy += -candidates->data[i].p * logf(candidates->data[i].p);
+ }
+
+ // Compute the absolute difference between negative log probability and entropy for each candidate
+ std::vector<float> shifted_scores;
+ for (size_t i = 0; i < candidates->size; ++i) {
+ float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
+ shifted_scores.push_back(shifted_score);
+ }
+
+ // Sort tokens based on the shifted_scores and their corresponding indices
+ std::vector<size_t> indices(candidates->size);
+ std::iota(indices.begin(), indices.end(), 0);
+
+ std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
+ return shifted_scores[a] < shifted_scores[b];
+ });
+
+ // Compute the cumulative probabilities
+ float cum_sum = 0.0f;
+ size_t last_idx = indices.size();
+
+ for (size_t i = 0; i < indices.size(); ++i) {
+ size_t idx = indices[i];
+ cum_sum += candidates->data[idx].p;
+
+ // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
+ if (cum_sum > p && i >= min_keep - 1) {
+ last_idx = i + 1;
+ break;
+ }
+ }
+
+ // Resize the output vector to keep only the locally typical tokens
+ std::vector<llama_token_data> new_candidates;
+ for (size_t i = 0; i < last_idx; ++i) {
+ size_t idx = indices[i];
+ new_candidates.push_back(candidates->data[idx]);
+ }
+
+ // Replace the data in candidates with the new_candidates data
+ std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
+ candidates->size = new_candidates.size();
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ for (size_t i = 0; i < candidates_p->size; ++i) {
+ candidates_p->data[i].logit /= temp;
+ }
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
+ if (last_tokens_size == 0 || penalty == 1.0f) {
+ return;
+ }
+
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ for (size_t i = 0; i < candidates->size; ++i) {
+ auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+ if (token_iter == last_tokens + last_tokens_size) {
+ continue;
+ }
+
+ // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+ // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+ if (candidates->data[i].logit <= 0) {
+ candidates->data[i].logit *= penalty;
+ } else {
+ candidates->data[i].logit /= penalty;
+ }
+ }
+
+ candidates->sorted = false;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
+ if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+ return;
+ }
+
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // Create a frequency map to count occurrences of each token in last_tokens
+ std::unordered_map<llama_token, int> token_count;
+ for (size_t i = 0; i < last_tokens_size; ++i) {
+ token_count[last_tokens_p[i]]++;
+ }
+
+ // Apply frequency and presence penalties to the candidates
+ for (size_t i = 0; i < candidates->size; ++i) {
+ auto token_iter = token_count.find(candidates->data[i].id);
+ if (token_iter == token_count.end()) {
+ continue;
+ }
+
+ int count = token_iter->second;
+ candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
+ }
+
+ candidates->sorted = false;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ }
+
+
+ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
+ assert(ctx);
+ auto N = float(llama_n_vocab(ctx));
+ int64_t t_start_sample_us;
+ t_start_sample_us = ggml_time_us();
+
+ llama_sample_softmax(nullptr, candidates);
+
+ // Estimate s_hat using the most probable m tokens
+ float s_hat = 0.0;
+ float sum_ti_bi = 0.0;
+ float sum_ti_sq = 0.0;
+ for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
+ float t_i = logf(float(i + 2) / float(i + 1));
+ float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
+ sum_ti_bi += t_i * b_i;
+ sum_ti_sq += t_i * t_i;
+ }
+ s_hat = sum_ti_bi / sum_ti_sq;
+
+ // Compute k from the estimated s_hat and target surprise value
+ float epsilon_hat = s_hat - 1;
+ float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
+
+ // Sample the next word X using top-k sampling
+ llama_sample_top_k(nullptr, candidates, int(k));
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ llama_token X = llama_sample_token(ctx, candidates);
+ t_start_sample_us = ggml_time_us();
+
+ // Compute error as the difference between observed surprise and target surprise value
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+ return candidate.id == X;
+ }));
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
+ float e = observed_surprise - tau;
+
+ // Update mu using the learning rate and error
+ *mu = *mu - eta * e;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ }
+ return X;
+ }
+
+ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
+ assert(ctx);
+ int64_t t_start_sample_us;
+ t_start_sample_us = ggml_time_us();
+
+ llama_sample_softmax(ctx, candidates);
+
+ // Truncate the words with surprise values greater than mu
+ candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+ return -log2f(candidate.p) > *mu;
+ }));
+
+ // Normalize the probabilities of the remaining words
+ llama_sample_softmax(ctx, candidates);
+
+ // Sample the next word X from the remaining words
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ llama_token X = llama_sample_token(ctx, candidates);
+ t_start_sample_us = ggml_time_us();
+
+ // Compute error as the difference between observed surprise and target surprise value
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+ return candidate.id == X;
+ }));
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
+ float e = observed_surprise - tau;
+
+ // Update mu using the learning rate and error
+ *mu = *mu - eta * e;
+
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+ return X;
+ }
+
+ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ // Find max element
+ auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit < b.logit;
+ });
+
+ llama_token result = max_iter->id;
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ }
+ return result;
+ }
+
+ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+ assert(ctx);
+ const int64_t t_start_sample_us = ggml_time_us();
+ llama_sample_softmax(nullptr, candidates);
+
+ std::vector<float> probs;
+ probs.reserve(candidates->size);
+ for (size_t i = 0; i < candidates->size; ++i) {
+ probs.push_back(candidates->data[i].p);
+ }

  std::discrete_distribution<> dist(probs.begin(), probs.end());
+ auto & rng = ctx->rng;
  int idx = dist(rng);

- return logits_id[idx].second;
+ llama_token result = candidates->data[idx].id;
+
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ ctx->n_sample++;
+ return result;
  }

  //
@@ -1593,7 +1901,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
  case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
  case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
- case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
  case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
  case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -1749,7 +2056,7 @@ struct llama_context * llama_init_from_file(

  llama_context * ctx = new llama_context;

- if (params.seed <= 0) {
+ if (params.seed < 0) {
  params.seed = time(NULL);
  }

@@ -2084,21 +2391,21 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
  }

- int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
  return ctx->model.kv_self.n;
  }

  #define LLAMA_MAX_RNG_STATE 64*1024

  void llama_set_rng_seed(struct llama_context * ctx, int seed) {
- if (seed <= 0) {
+ if (seed < 0) {
  seed = time(NULL);
  }
  ctx->rng.seed(seed);
  }

- // Returns the size of the state
- size_t llama_get_state_size(struct llama_context * ctx) {
+ // Returns the *maximum* size of the state
+ size_t llama_get_state_size(const struct llama_context * ctx) {
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
  const size_t s_rng_size = sizeof(size_t);
@@ -2176,21 +2483,51 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {

  // copy kv cache
  {
- const size_t kv_size = ctx->model.kv_self.buf.size;
+ const auto & kv_self = ctx->model.kv_self;
+ const auto & hparams = ctx->model.hparams;
+ const int n_layer = hparams.n_layer;
+ const int n_embd = hparams.n_embd;
+ const int n_ctx = hparams.n_ctx;
+
+ const size_t kv_size = kv_self.buf.size;
  const int kv_ntok = llama_get_kv_cache_token_count(ctx);

  memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
  memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);

  if (kv_size) {
- memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+ const size_t elt_size = ggml_element_size(kv_self.k);
+ char buffer[4096];
+ ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_cgraph gf{};
+ gf.n_threads = 1;
+
+ ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+ kout3d->data = out;
+ out += ggml_nbytes(kout3d);
+
+ ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+ vout3d->data = out;
+ out += ggml_nbytes(vout3d);
+
+ ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+ n_embd, kv_ntok, n_layer,
+ elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+
+ ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
+ kv_ntok, n_embd, n_layer,
+ elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+ ggml_graph_compute(cpy_ctx, &gf);
  }
  }

  const size_t written = out - dest;
- const size_t expected = llama_get_state_size(ctx);
+ const size_t max_size = llama_get_state_size(ctx);

- LLAMA_ASSERT(written == expected);
+ LLAMA_ASSERT(written <= max_size);

  return written;
  }
@@ -2248,6 +2585,12 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {

  // set kv cache
  {
+ const auto & kv_self = ctx->model.kv_self;
+ const auto & hparams = ctx->model.hparams;
+ const int n_layer = hparams.n_layer;
+ const int n_embd = hparams.n_embd;
+ const int n_ctx = hparams.n_ctx;
+
  size_t kv_size;
  int kv_ntok;

@@ -2255,29 +2598,125 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);

  if (kv_size) {
- LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+ LLAMA_ASSERT(kv_self.buf.size == kv_size);
+
+ const size_t elt_size = ggml_element_size(kv_self.k);
+ char buffer[4096];
+ ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+ ggml_cgraph gf{};
+ gf.n_threads = 1;
+
+ ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+ kin3d->data = (void *) in;
+ in += ggml_nbytes(kin3d);

- void * k_data = ctx->model.kv_self.k->data; // remember data pointers
- void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+ ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+ vin3d->data = (void *) in;
+ in += ggml_nbytes(vin3d);

- memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+ ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+ n_embd, kv_ntok, n_layer,
+ elt_size*n_embd, elt_size*n_embd*n_ctx, 0);

- ctx->model.kv_self.k->data = k_data; // restore correct data pointers
- ctx->model.kv_self.v->data = v_data;
+ ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
+ kv_ntok, n_embd, n_layer,
+ elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+ ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+ ggml_graph_compute(cpy_ctx, &gf);
  }

  ctx->model.kv_self.n = kv_ntok;
  }

  const size_t nread = in - src;
- const size_t expected = llama_get_state_size(ctx);
+ const size_t max_size = llama_get_state_size(ctx);

- LLAMA_ASSERT(nread == expected);
+ LLAMA_ASSERT(nread <= max_size);

  return nread;
  }

+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ llama_file file(path_session, "rb");
+
+ // sanity checks
+ {
+ const uint32_t magic = file.read_u32();
+ const uint32_t version = file.read_u32();
+
+ if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+ fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+ return false;
+ }
+
+ llama_hparams session_hparams;
+ file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+ if (session_hparams != ctx->model.hparams) {
+ fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+ return false;
+ }
+ }
+
+ // load the prompt
+ {
+ const uint32_t n_token_count = file.read_u32();
+
+ if (n_token_count > n_token_capacity) {
+ fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+ return false;
+ }
+
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+ *n_token_count_out = n_token_count;
+ }
+
+ // restore the context state
+ {
+ const size_t n_state_size_cur = file.size - file.tell();
+ const size_t n_state_size_max = llama_get_state_size(ctx);
+
+ if (n_state_size_cur > n_state_size_max) {
+ fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+ return false;
+ }
+
+ std::vector<uint8_t> state_data(n_state_size_max);
+ file.read_raw(state_data.data(), n_state_size_cur);
+
+ llama_set_state_data(ctx, state_data.data());
+ }
+
+ return true;
+ }
+
+ bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+ llama_file file(path_session, "wb");
+
+ file.write_u32(LLAMA_SESSION_MAGIC);
+ file.write_u32(LLAMA_SESSION_VERSION);
+
+ file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+ // save the prompt
+ file.write_u32((uint32_t) n_token_count);
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+ // save the context state
+ {
+ const size_t n_state_size_max = llama_get_state_size(ctx);
+
+ std::vector<uint8_t> state_data(n_state_size_max);
+ const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
+
+ file.write_raw(state_data.data(), n_state_size_cur);
+ }
+
+ return true;
+ }
+
  int llama_eval(
  struct llama_context * ctx,
  const llama_token * tokens,
@@ -2316,15 +2755,15 @@ int llama_tokenize(
  return res.size();
  }

- int llama_n_vocab(struct llama_context * ctx) {
+ int llama_n_vocab(const struct llama_context * ctx) {
  return ctx->vocab.id_to_token.size();
  }

- int llama_n_ctx(struct llama_context * ctx) {
+ int llama_n_ctx(const struct llama_context * ctx) {
  return ctx->model.hparams.n_ctx;
  }

- int llama_n_embd(struct llama_context * ctx) {
+ int llama_n_embd(const struct llama_context * ctx) {
  return ctx->model.hparams.n_embd;
  }

@@ -2336,7 +2775,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
  return ctx->embedding.data();
  }

- const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
+ const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
  if (token >= llama_n_vocab(ctx)) {
  return nullptr;
  }
@@ -2352,33 +2791,8 @@ llama_token llama_token_eos() {
  return 2;
  }

- llama_token llama_sample_top_p_top_k(
- llama_context * ctx,
- const llama_token * last_n_tokens_data,
- int last_n_tokens_size,
- int top_k,
- float top_p,
- float temp,
- float repeat_penalty) {
- const int64_t t_start_sample_us = ggml_time_us();
-
- llama_token result = 0;
-
- // TODO: avoid this ...
- const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
-
- result = llama_sample_top_p_top_k(
- *ctx,
- last_n_tokens,
- top_k,
- top_p,
- temp,
- repeat_penalty);
-
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- ctx->n_sample++;
-
- return result;
+ llama_token llama_token_nl() {
+ return 13;
  }


@@ -2430,4 +2844,3 @@ const char * llama_print_system_info(void) {
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
  return ctx->model.tensors_by_name;
  }
-
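
Note on the API change above: 0.1.0 removes the single llama_sample_top_p_top_k() entry point and replaces it with the composable llama_sample_* functions added in this diff, alongside the new llama_load_session_file() / llama_save_session_file() helpers. The sketch below shows how a caller might chain the new functions to approximate the old top-k / top-p / temperature / repetition-penalty behaviour. It is a minimal illustration only: it assumes the matching llama.h declarations behind the definitions shown in the diff (llama_token_data, llama_token_data_array, llama_get_logits(), llama_n_vocab()), and the helper name sample_next_token and the explicit min_keep = 1 arguments are this note's own choices, not part of the package.

    // Sketch (not from the package): chaining the new 0.1.0 sampling API.
    #include <vector>
    #include "llama.h"

    static llama_token sample_next_token(llama_context * ctx,
                                         const std::vector<llama_token> & last_tokens,
                                         int top_k, float top_p, float temp, float repeat_penalty) {
        const int n_vocab = llama_n_vocab(ctx);
        float * logits = llama_get_logits(ctx); // logits for the last evaluated token

        // Build the candidate list from the raw logits.
        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token id = 0; id < n_vocab; ++id) {
            candidates.push_back(llama_token_data{id, logits[id], 0.0f});
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // Penalize recently seen tokens, then truncate, then apply temperature, then draw.
        // (For temp <= 0 one would call llama_sample_token_greedy instead.)
        llama_sample_repetition_penalty(ctx, &candidates_p,
                                        last_tokens.data(), last_tokens.size(), repeat_penalty);
        llama_sample_top_k(ctx, &candidates_p, top_k > 0 ? top_k : n_vocab, 1);
        llama_sample_top_p(ctx, &candidates_p, top_p, 1);
        llama_sample_temperature(ctx, &candidates_p, temp);
        return llama_sample_token(ctx, &candidates_p);
    }

The new session helpers follow the same pattern as the state functions they wrap: llama_save_session_file(ctx, path, tokens, n_tokens) writes the evaluated prompt tokens plus a llama_copy_state_data() snapshot, and llama_load_session_file(ctx, path, tokens_out, capacity, &n_tokens) restores them, so a caller can avoid re-evaluating a previously cached prompt.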