cui-llama.rn 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/android/src/main/CMakeLists.txt +5 -7
  2. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -4
  3. package/android/src/main/jni.cpp +9 -9
  4. package/cpp/common.cpp +28 -44
  5. package/cpp/common.h +35 -14
  6. package/cpp/ggml-alloc.c +0 -1
  7. package/cpp/ggml-backend-impl.h +38 -20
  8. package/cpp/ggml-backend-reg.cpp +246 -92
  9. package/cpp/ggml-backend.h +1 -0
  10. package/cpp/ggml-common.h +42 -48
  11. package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +642 -223
  12. package/cpp/ggml-cpu-aarch64.h +2 -26
  13. package/cpp/ggml-cpu-traits.cpp +36 -0
  14. package/cpp/ggml-cpu-traits.h +38 -0
  15. package/cpp/ggml-cpu.c +14122 -13971
  16. package/cpp/ggml-cpu.cpp +627 -715
  17. package/cpp/ggml-cpu.h +0 -17
  18. package/cpp/ggml-impl.h +22 -6
  19. package/cpp/ggml-metal.m +482 -24
  20. package/cpp/ggml-quants.c +0 -9
  21. package/cpp/ggml-threading.h +4 -2
  22. package/cpp/ggml.c +284 -178
  23. package/cpp/ggml.h +73 -25
  24. package/cpp/llama-grammar.cpp +15 -15
  25. package/cpp/llama-grammar.h +2 -5
  26. package/cpp/llama-sampling.cpp +35 -90
  27. package/cpp/llama-vocab.cpp +7 -2
  28. package/cpp/llama-vocab.h +1 -1
  29. package/cpp/llama.cpp +1782 -586
  30. package/cpp/llama.h +20 -19
  31. package/cpp/sampling.cpp +11 -16
  32. package/cpp/sgemm.cpp +265 -258
  33. package/cpp/sgemm.h +2 -2
  34. package/cpp/speculative.cpp +4 -0
  35. package/cpp/unicode.cpp +51 -51
  36. package/cpp/unicode.h +9 -10
  37. package/lib/commonjs/index.js +38 -1
  38. package/lib/commonjs/index.js.map +1 -1
  39. package/lib/module/index.js +36 -0
  40. package/lib/module/index.js.map +1 -1
  41. package/lib/typescript/NativeRNLlama.d.ts +2 -3
  42. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  43. package/lib/typescript/index.d.ts +36 -2
  44. package/lib/typescript/index.d.ts.map +1 -1
  45. package/package.json +1 -1
  46. package/src/NativeRNLlama.ts +3 -3
  47. package/src/index.ts +46 -2
  48. package/cpp/amx/amx.cpp +0 -196
  49. package/cpp/amx/amx.h +0 -20
  50. package/cpp/amx/common.h +0 -101
  51. package/cpp/amx/mmq.cpp +0 -2524
  52. package/cpp/amx/mmq.h +0 -16
  53. package/cpp/ggml-aarch64.c +0 -129
  54. package/cpp/ggml-aarch64.h +0 -19
package/cpp/ggml.h CHANGED
@@ -238,7 +238,9 @@
 #define LM_GGML_EXIT_SUCCESS 0
 #define LM_GGML_EXIT_ABORTED 1
 
-#define LM_GGML_ROPE_TYPE_NEOX 2
+#define LM_GGML_ROPE_TYPE_NEOX   2
+#define LM_GGML_ROPE_TYPE_MROPE  8
+#define LM_GGML_ROPE_TYPE_VISION 24
 
 #define LM_GGUF_MAGIC "GGUF"
 
@@ -385,15 +387,15 @@ extern "C" {
         LM_GGML_TYPE_F64     = 28,
         LM_GGML_TYPE_IQ1_M   = 29,
         LM_GGML_TYPE_BF16    = 30,
-        LM_GGML_TYPE_Q4_0_4_4 = 31,
-        LM_GGML_TYPE_Q4_0_4_8 = 32,
-        LM_GGML_TYPE_Q4_0_8_8 = 33,
+        // LM_GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+        // LM_GGML_TYPE_Q4_0_4_8 = 32,
+        // LM_GGML_TYPE_Q4_0_8_8 = 33,
         LM_GGML_TYPE_TQ1_0   = 34,
         LM_GGML_TYPE_TQ2_0   = 35,
-        LM_GGML_TYPE_IQ4_NL_4_4 = 36,
+        // LM_GGML_TYPE_IQ4_NL_4_4 = 36,
         // LM_GGML_TYPE_IQ4_NL_4_8 = 37,
         // LM_GGML_TYPE_IQ4_NL_8_8 = 38,
-        LM_GGML_TYPE_COUNT,
+        LM_GGML_TYPE_COUNT   = 39,
     };
 
     // precision
@@ -434,9 +436,6 @@ extern "C" {
         LM_GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         LM_GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
         LM_GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
-        LM_GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
-        LM_GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
-        LM_GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
     };
 
     // available tensor operations:
@@ -500,6 +499,7 @@ extern "C" {
         LM_GGML_OP_POOL_2D_BACK,
         LM_GGML_OP_UPSCALE, // nearest interpolate
         LM_GGML_OP_PAD,
+        LM_GGML_OP_PAD_REFLECT_1D,
        LM_GGML_OP_ARANGE,
        LM_GGML_OP_TIMESTEP_EMBEDDING,
        LM_GGML_OP_ARGSORT,
@@ -1446,6 +1446,22 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_multi(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            struct lm_ggml_tensor  * b,
+            struct lm_ggml_tensor  * c,
+            int                      n_dims,
+            int                      sections[4],
+            int                      mode,
+            int                      n_ctx_orig,
+            float                    freq_base,
+            float                    freq_scale,
+            float                    ext_factor,
+            float                    attn_factor,
+            float                    beta_fast,
+            float                    beta_slow);
+
     // in-place, returns view(a)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext_inplace(
             struct lm_ggml_context * ctx,
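For orientation, a minimal caller-side sketch of the new multi-section RoPE entry point (not taken from the package; `ctx`, `q`, and `pos` are assumed to exist, and the section split and frequency parameters are illustrative only):

    // hypothetical: rotate query tensor `q` with M-RoPE, splitting the rotary
    // dimensions into per-axis sections (illustrative values)
    int sections[4] = { 16, 24, 24, 0 };
    struct lm_ggml_tensor * q_rot = lm_ggml_rope_multi(
            ctx, q, pos, /* freq factors */ NULL,
            /* n_dims      */ 128,
            sections,
            /* mode        */ LM_GGML_ROPE_TYPE_MROPE,
            /* n_ctx_orig  */ 0,
            /* freq_base   */ 10000.0f,
            /* freq_scale  */ 1.0f,
            /* ext_factor  */ 0.0f,
            /* attn_factor */ 1.0f,
            /* beta_fast   */ 32.0f,
            /* beta_slow   */ 1.0f);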
@@ -1549,17 +1565,6 @@ extern "C" {
             int                   d1, // dilation dimension 1
             bool                  is_2D);
 
-    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_depthwise_2d(
-            struct lm_ggml_context * ctx,
-            struct lm_ggml_tensor  * a,  // convolution kernel
-            struct lm_ggml_tensor  * b,  // data
-            int                      s0, // stride dimension 0
-            int                      s1, // stride dimension 1
-            int                      p0, // padding dimension 0
-            int                      p1, // padding dimension 1
-            int                      d0, // dilation dimension 0
-            int                      d1); // dilation dimension 1
-
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,  // convolution kernel
@@ -1577,6 +1582,23 @@ extern "C" {
             int                      s,  // stride
             int                      d); // dilation
 
+    // depthwise
+    // TODO: this is very likely wrong for some cases! - needs more testing
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d_dw(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,  // convolution kernel
+            struct lm_ggml_tensor  * b,  // data
+            int                      s0, // stride
+            int                      p0, // padding
+            int                      d0); // dilation
+
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d_dw_ph(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,  // convolution kernel
+            struct lm_ggml_tensor  * b,  // data
+            int                      s0, // stride
+            int                      d0); // dilation
+
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,  // convolution kernel
@@ -1596,7 +1618,6 @@ extern "C" {
             int                      d0, // dilation dimension 0
             int                      d1); // dilation dimension 1
 
-
     // kernel size is a->ne[0] x a->ne[1]
     // stride is equal to kernel size
     // padding is zero
@@ -1623,6 +1644,18 @@ extern "C" {
             struct lm_ggml_tensor  * a,
             struct lm_ggml_tensor  * b);
 
+    // depthwise
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_2d_dw(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,  // convolution kernel
+            struct lm_ggml_tensor  * b,  // data
+            int                      s0, // stride dimension 0
+            int                      s1, // stride dimension 1
+            int                      p0, // padding dimension 0
+            int                      p1, // padding dimension 1
+            int                      d0, // dilation dimension 0
+            int                      d1); // dilation dimension 1
+
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_2d_p0(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
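A caller-side sketch of the depthwise variant added above (hypothetical: it assumes a kernel tensor `k` with one filter per input channel and an input tensor `x` already built on `ctx`; the stride, padding, and dilation values are illustrative):

    // hypothetical: 3x3 depthwise convolution, stride 1, one pixel of padding
    struct lm_ggml_tensor * y = lm_ggml_conv_2d_dw(ctx, k, x,
            /* s0 */ 1, /* s1 */ 1,
            /* p0 */ 1, /* p1 */ 1,
            /* d0 */ 1, /* d1 */ 1);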
@@ -1696,6 +1729,13 @@ extern "C" {
             int                  p2,
             int                  p3);
 
+    // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_pad_reflect_1d(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            int                      p0,
+            int                      p1);
+
     // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
     // timesteps: [N,]
     // return: [N, dim]
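The reflection example in the header comment corresponds to one element of padding on each side; a hedged usage sketch (assuming `ctx` and a 1-D float tensor `t` holding [a, b, c, d]):

    // reflection-pad by 1 on the left (p0) and 1 on the right (p1):
    // [a, b, c, d] -> [b, a, b, c, d, c]
    struct lm_ggml_tensor * t_pad = lm_ggml_pad_reflect_1d(ctx, t, /* p0 */ 1, /* p1 */ 1);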
@@ -2198,11 +2238,19 @@ extern "C" {
     LM_GGML_API size_t lm_gguf_get_meta_size(const struct lm_gguf_context * ctx);
     LM_GGML_API void   lm_gguf_get_meta_data(const struct lm_gguf_context * ctx, void * data);
 
-#ifdef __cplusplus
-// restrict not standard in C++
-#define LM_GGML_RESTRICT
+#ifdef __cplusplus
+// restrict not standard in C++
+#    if defined(__GNUC__)
+#        define LM_GGML_RESTRICT __restrict__
+#    elif defined(__clang__)
+#        define LM_GGML_RESTRICT __restrict
+#    elif defined(_MSC_VER)
+#        define LM_GGML_RESTRICT __restrict
+#    else
+#        define LM_GGML_RESTRICT
+#    endif
 #else
-#define LM_GGML_RESTRICT restrict
+#    define LM_GGML_RESTRICT restrict
 #endif
 typedef void (*lm_ggml_to_float_t)  (const void  * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int64_t k);
 typedef void (*lm_ggml_from_float_t)(const float * LM_GGML_RESTRICT x, void  * LM_GGML_RESTRICT y, int64_t k);
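The practical effect, sketched as comments (the expansions shown assume a GCC or Clang C++ build, where the __GNUC__ branch is taken; this is not code from the package):

    // before: in C++ the qualifier expanded to nothing
    //   typedef void (*lm_ggml_to_float_t)(const void * x, float * y, int64_t k);
    // after: the no-alias hint survives in C++ via the compiler extension
    //   typedef void (*lm_ggml_to_float_t)(const void * __restrict__ x, float * __restrict__ y, int64_t k);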
package/cpp/llama-grammar.cpp CHANGED
@@ -822,15 +822,11 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
     return grammar->stacks;
 }
 
-void llama_grammar_accept(
-        const llama_grammar_rules  & rules,
-        const llama_grammar_stacks & stacks,
-        const uint32_t chr,
-        llama_grammar_stacks & stacks_new) {
-    stacks_new.clear();
-    stacks_new.reserve(stacks.size());
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
+    llama_grammar_stacks stacks_new;
+    stacks_new.reserve(grammar->stacks.size());
 
-    for (const auto & stack : stacks) {
+    for (const auto & stack : grammar->stacks) {
         if (stack.empty()) {
             continue;
         }
@@ -844,9 +840,11 @@ void llama_grammar_accept(
             if (!llama_grammar_is_end_of_sequence(pos)) {
                 new_stack.push_back(pos);
             }
-            llama_grammar_advance_stack(rules, new_stack, stacks_new);
+            llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
         }
     }
+
+    grammar->stacks = std::move(stacks_new);
 }
 
 llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@@ -1051,7 +1049,12 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
 }
 
 struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
-    llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, };
+    llama_grammar * result = new llama_grammar {
+        grammar.vocab,
+        grammar.rules,
+        grammar.stacks,
+        grammar.partial_utf8,
+    };
 
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
@@ -1059,7 +1062,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
         for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) {
             for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) {
                 if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) {
-                    result->stacks[is][ie] = &result->rules[ir0][ir1];
+                    result->stacks[is][ie] =  &result->rules[ir0][ir1];
                 }
             }
         }
@@ -1126,11 +1129,8 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
     const auto   decoded     = decode_utf8(piece, grammar.partial_utf8);
     const auto & code_points = decoded.first;
 
-    llama_grammar_stacks stacks_new;
-
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        llama_grammar_accept(grammar.rules, grammar.stacks, *it, stacks_new);
-        grammar.stacks = std::move(stacks_new);
+        llama_grammar_accept(&grammar, *it);
     }
 
     grammar.partial_utf8 = decoded.second;
package/cpp/llama-grammar.h CHANGED
@@ -58,6 +58,7 @@ using llama_grammar_rules = std::vector<llama_grammar_rule>;
 using llama_grammar_stacks     = std::vector<llama_grammar_stack>;
 using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
 
+// TODO: remove, needed for tests atm
 const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
       llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);
 
@@ -65,11 +66,7 @@ const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar
 // be positioned at a character range (see `llama_grammar_advance_stack`), and
 // produces the N possible stacks if the given char is accepted at those
 // positions
-void llama_grammar_accept(
-        const llama_grammar_rules  & rules,
-        const llama_grammar_stacks & stacks,
-              uint32_t               chr,
-              llama_grammar_stacks & stacks_new);
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr);
 
 std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
         const llama_grammar_rules & rules,
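Caller-side, the refactor removes the external scratch vector; a hedged before/after sketch (assuming a constructed `struct llama_grammar * grammar` and a decoded code point `cpt`):

    // before: the caller owned the scratch stacks and swapped them in
    //   llama_grammar_stacks stacks_new;
    //   llama_grammar_accept(grammar->rules, grammar->stacks, cpt, stacks_new);
    //   grammar->stacks = std::move(stacks_new);
    // after: the grammar updates its own stacks in place
    llama_grammar_accept(grammar, cpt);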
package/cpp/llama-sampling.cpp CHANGED
@@ -1397,19 +1397,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
 // penalties
 
 struct llama_sampler_penalties {
-    const int32_t     n_vocab;
-    const llama_token special_eos_id;
-    const llama_token linefeed_id;
-
     const int32_t penalty_last_n;
     const float   penalty_repeat;
     const float   penalty_freq;
     const float   penalty_present;
 
-    const bool penalize_nl;
-    const bool ignore_eos;
-
     ring_buffer<llama_token> prev;
+
+    // a frequency map to count token occurrences
+    std::unordered_map<llama_token, int> token_count;
 };
 
 static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1422,76 +1418,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
         return;
     }
 
-    ctx->prev.push_back(token);
-}
-
-static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+    ctx->token_count[token]++;
 
-    if (ctx->ignore_eos) {
-        assert(ctx->special_eos_id >= 0);
+    // if the ring buffer is full, remove the oldest token
+    if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+        const auto old = ctx->prev.front();
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
-            cur_p->data[ctx->special_eos_id].logit = -INFINITY;
-        } else {
-            // else, search for the special EOS token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->special_eos_id) {
-                    cur_p->data[i].logit = -INFINITY;
-                    break;
-                }
-            }
+        ctx->token_count[old]--;
+        if (ctx->token_count[old] == 0) {
+            ctx->token_count.erase(old);
         }
     }
 
-    if ((ctx->penalty_last_n == 0) ||
-        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
-        return;
-    }
-
-    bool nl_found = false;
-    size_t nl_idx = 0;
-    float nl_logit = -INFINITY;
-    if (!ctx->penalize_nl) {
-        assert(ctx->linefeed_id >= 0);
+    ctx->prev.push_back(token);
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
-            nl_found = true;
-            nl_idx = ctx->linefeed_id;
-            nl_logit = cur_p->data[ctx->linefeed_id].logit;
-        } else {
-            // else, search for the linefeed token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->linefeed_id) {
-                    nl_found = true;
-                    nl_idx = i;
-                    nl_logit = cur_p->data[i].logit;
-                    break;
-                }
-            }
-        }
+#if 0
+    // sanity check
+    std::unordered_map<llama_token, int> tmp;
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+        tmp[ctx->prev.rat(i)]++;
     }
 
-    // Create a frequency map to count occurrences of each token in last_tokens
-    // TODO: optimize this by maintaining the token count in the sampler context
-    using llama_token_cnt = std::unordered_map<llama_token, int>;
-    llama_token_cnt token_count;
+    assert(ctx->token_count == tmp);
+#endif
+}
+
+static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
 
-    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
-        token_count[ctx->prev.rat(i)]++;
+    if ((ctx->penalty_last_n == 0) ||
+        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
+        return;
     }
 
     // Apply frequency and presence penalties to the cur_p
     for (size_t i = 0; i < cur_p->size; ++i) {
-        const auto token_iter = token_count.find(cur_p->data[i].id);
-        if (token_iter == token_count.end()) {
+        const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+        if (token_iter == ctx->token_count.end()) {
             continue;
         }
 
         const int count = token_iter->second;
 
+        assert(count > 0 && count <= ctx->penalty_last_n);
+
         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
         if (cur_p->data[i].logit <= 0) {
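The net effect of this hunk: instead of rebuilding a frequency map from the ring buffer on every apply call, the sampler keeps `token_count` in sync as tokens enter and leave the last-n window. A self-contained sketch of that sliding-window counting scheme (plain C++ with a std::deque standing in for the ring buffer; not the package's code):

    #include <cstdint>
    #include <deque>
    #include <unordered_map>

    struct penalty_window {
        int last_n;                                 // window size (penalty_last_n)
        std::deque<int32_t> prev;                   // last tokens, oldest at the front
        std::unordered_map<int32_t, int> count;     // token -> occurrences in the window

        void accept(int32_t token) {
            if (last_n == 0) {
                return;                             // mirrors the early return above
            }
            count[token]++;
            if ((int) prev.size() >= last_n) {      // window full: evict the oldest token
                const int32_t old = prev.front();
                prev.pop_front();
                if (--count[old] == 0) {
                    count.erase(old);
                }
            }
            prev.push_back(token);
        }
    };

The apply step then only has to look up each candidate id in `count`, which is what the rewritten loop above does with `ctx->token_count.find(...)`.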
@@ -1504,30 +1474,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
     }
 
     cur_p->sorted = false;
-
-    if (!ctx->penalize_nl && nl_found) {
-        // restore the logit of the newline token if it was penalized
-        cur_p->data[nl_idx].logit = nl_logit;
-    }
 }
 
 static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_penalties *) smpl->ctx;
     ctx->prev.clear();
+    ctx->token_count.clear();
 }
 
 static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
     auto * result = llama_sampler_init_penalties(
-            ctx->n_vocab,
-            ctx->special_eos_id,
-            ctx->linefeed_id,
             ctx->penalty_last_n,
             ctx->penalty_repeat,
             ctx->penalty_freq,
-            ctx->penalty_present,
-            ctx->penalize_nl,
-            ctx->ignore_eos);
+            ctx->penalty_present);
 
     // copy the state
     {
@@ -1553,38 +1514,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
 };
 
 struct llama_sampler * llama_sampler_init_penalties(
-        int32_t n_vocab,
-        llama_token special_eos_id,
-        llama_token linefeed_id,
         int32_t penalty_last_n,
         float penalty_repeat,
         float penalty_freq,
-        float penalty_present,
-        bool penalize_nl,
-        bool ignore_eos) {
-    if (linefeed_id == LLAMA_TOKEN_NULL) {
-        penalize_nl = true;
-    }
-
-    if (special_eos_id == LLAMA_TOKEN_NULL) {
-        ignore_eos = false;
-    }
-
+        float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);
 
     return new llama_sampler {
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx   = */ new llama_sampler_penalties {
-            /* .n_vocab         = */ n_vocab,
-            /* .special_eos_id  = */ special_eos_id,
-            /* .linefeed_id     = */ linefeed_id,
             /* .penalty_last_n  = */ penalty_last_n,
             /* .penalty_repeat  = */ penalty_repeat,
             /* .penalty_freq    = */ penalty_freq,
             /* .penalty_present = */ penalty_present,
-            /* .penalize_nl     = */ penalize_nl,
-            /* .ignore_eos      = */ ignore_eos,
             /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
+            /* .token_count     = */ {},
         },
     };
 }
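A hedged construction sketch against the new, slimmer signature (the argument values are illustrative, not taken from the package):

    // hypothetical: repetition penalty over the last 64 tokens, no freq/presence penalty
    struct llama_sampler * pen = llama_sampler_init_penalties(
            /* penalty_last_n  */ 64,
            /* penalty_repeat  */ 1.10f,
            /* penalty_freq    */ 0.00f,
            /* penalty_present */ 0.00f);

Callers that previously passed n_vocab, the EOS and linefeed token ids, penalize_nl, or ignore_eos simply drop those arguments; newline restoration and EOS masking are no longer handled by this sampler.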
@@ -1612,7 +1556,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
         if (word.find(str) != std::string::npos) {
             token_sequences.emplace(token_id, std::vector<llama_token>());
         } else {
-            size_t word_len = word.size(), str_len = str.size();
+            size_t word_len = word.size();
+            size_t str_len = str.size();
             size_t pos = -1;
             while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
                 bool match = true;
package/cpp/llama-vocab.cpp CHANGED
@@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
             case LLAMA_VOCAB_PRE_TYPE_EXAONE:
+            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -737,7 +738,7 @@ struct llm_tokenizer_wpm_session {
         std::vector<std::string> words(1, "");
 
         for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
+            const auto flags = unicode_cpt_flags_from_cpt(cpt);
 
             if (flags.is_whitespace) {
                 if (words.back().size()) {  // finish previous word if any
@@ -1656,7 +1657,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
 }
 
 llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
-    return vocab.special_bos_id;
+    return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
 }
 
 llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
@@ -1866,6 +1867,10 @@ int32_t llama_detokenize_impl(
         int32_t   text_len_max,
         bool      remove_special,
         bool      unparse_special) {
+    if (vocab.type == LLAMA_VOCAB_TYPE_NONE) {
+        return 0;
+    }
+
     LM_GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
 
     int32_t avail = text_len_max;
package/cpp/llama-vocab.h CHANGED
@@ -45,7 +45,7 @@ struct llama_vocab {
     id special_unk_id  = 0;
     id special_sep_id  = LLAMA_TOKEN_NULL;
     id special_pad_id  = LLAMA_TOKEN_NULL;
-    id special_cls_id  = LLAMA_TOKEN_NULL;
+    id special_cls_id  = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
     id special_mask_id = LLAMA_TOKEN_NULL;
 
     id linefeed_id = 13;