cui-llama.rn 1.3.3 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +5 -7
- package/android/src/main/java/com/rnllama/LlamaContext.java +4 -4
- package/android/src/main/jni.cpp +9 -9
- package/cpp/common.cpp +28 -44
- package/cpp/common.h +35 -14
- package/cpp/ggml-alloc.c +0 -1
- package/cpp/ggml-backend-impl.h +38 -20
- package/cpp/ggml-backend-reg.cpp +246 -92
- package/cpp/ggml-backend.h +1 -0
- package/cpp/ggml-common.h +42 -48
- package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +642 -223
- package/cpp/ggml-cpu-aarch64.h +2 -26
- package/cpp/ggml-cpu-traits.cpp +36 -0
- package/cpp/ggml-cpu-traits.h +38 -0
- package/cpp/ggml-cpu.c +14122 -13971
- package/cpp/ggml-cpu.cpp +627 -715
- package/cpp/ggml-cpu.h +0 -17
- package/cpp/ggml-impl.h +22 -6
- package/cpp/ggml-metal.m +482 -24
- package/cpp/ggml-quants.c +0 -9
- package/cpp/ggml-threading.h +4 -2
- package/cpp/ggml.c +284 -178
- package/cpp/ggml.h +73 -25
- package/cpp/llama-grammar.cpp +15 -15
- package/cpp/llama-grammar.h +2 -5
- package/cpp/llama-sampling.cpp +35 -90
- package/cpp/llama-vocab.cpp +7 -2
- package/cpp/llama-vocab.h +1 -1
- package/cpp/llama.cpp +1782 -586
- package/cpp/llama.h +20 -19
- package/cpp/sampling.cpp +11 -16
- package/cpp/sgemm.cpp +265 -258
- package/cpp/sgemm.h +2 -2
- package/cpp/speculative.cpp +4 -0
- package/cpp/unicode.cpp +51 -51
- package/cpp/unicode.h +9 -10
- package/lib/commonjs/index.js +38 -1
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/index.js +36 -0
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +2 -3
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +36 -2
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +3 -3
- package/src/index.ts +46 -2
- package/cpp/amx/amx.cpp +0 -196
- package/cpp/amx/amx.h +0 -20
- package/cpp/amx/common.h +0 -101
- package/cpp/amx/mmq.cpp +0 -2524
- package/cpp/amx/mmq.h +0 -16
- package/cpp/ggml-aarch64.c +0 -129
- package/cpp/ggml-aarch64.h +0 -19
package/cpp/ggml.h
CHANGED
@@ -238,7 +238,9 @@
 #define LM_GGML_EXIT_SUCCESS 0
 #define LM_GGML_EXIT_ABORTED 1
 
-#define LM_GGML_ROPE_TYPE_NEOX 2
+#define LM_GGML_ROPE_TYPE_NEOX   2
+#define LM_GGML_ROPE_TYPE_MROPE  8
+#define LM_GGML_ROPE_TYPE_VISION 24
 
 #define LM_GGUF_MAGIC "GGUF"
 
@@ -385,15 +387,15 @@ extern "C" {
         LM_GGML_TYPE_F64   = 28,
         LM_GGML_TYPE_IQ1_M = 29,
         LM_GGML_TYPE_BF16  = 30,
-        LM_GGML_TYPE_Q4_0_4_4 = 31,
-        LM_GGML_TYPE_Q4_0_4_8 = 32,
-        LM_GGML_TYPE_Q4_0_8_8 = 33,
+        // LM_GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+        // LM_GGML_TYPE_Q4_0_4_8 = 32,
+        // LM_GGML_TYPE_Q4_0_8_8 = 33,
         LM_GGML_TYPE_TQ1_0 = 34,
         LM_GGML_TYPE_TQ2_0 = 35,
-        LM_GGML_TYPE_IQ4_NL_4_4 = 36,
+        // LM_GGML_TYPE_IQ4_NL_4_4 = 36,
         // LM_GGML_TYPE_IQ4_NL_4_8 = 37,
         // LM_GGML_TYPE_IQ4_NL_8_8 = 38,
-        LM_GGML_TYPE_COUNT,
+        LM_GGML_TYPE_COUNT = 39,
     };
 
     // precision
@@ -434,9 +436,6 @@ extern "C" {
         LM_GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         LM_GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
         LM_GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
-        LM_GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
-        LM_GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
-        LM_GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
     };
 
     // available tensor operations:
@@ -500,6 +499,7 @@ extern "C" {
         LM_GGML_OP_POOL_2D_BACK,
         LM_GGML_OP_UPSCALE, // nearest interpolate
         LM_GGML_OP_PAD,
+        LM_GGML_OP_PAD_REFLECT_1D,
         LM_GGML_OP_ARANGE,
         LM_GGML_OP_TIMESTEP_EMBEDDING,
         LM_GGML_OP_ARGSORT,
@@ -1446,6 +1446,22 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_multi(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            struct lm_ggml_tensor  * b,
+            struct lm_ggml_tensor  * c,
+            int                      n_dims,
+            int                      sections[4],
+            int                      mode,
+            int                      n_ctx_orig,
+            float                    freq_base,
+            float                    freq_scale,
+            float                    ext_factor,
+            float                    attn_factor,
+            float                    beta_fast,
+            float                    beta_slow);
+
     // in-place, returns view(a)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext_inplace(
             struct lm_ggml_context * ctx,
@@ -1549,17 +1565,6 @@ extern "C" {
             int                   d1, // dilation dimension 1
             bool                  is_2D);
 
-    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_depthwise_2d(
-            struct lm_ggml_context * ctx,
-            struct lm_ggml_tensor  * a,   // convolution kernel
-            struct lm_ggml_tensor  * b,   // data
-            int                      s0,  // stride dimension 0
-            int                      s1,  // stride dimension 1
-            int                      p0,  // padding dimension 0
-            int                      p1,  // padding dimension 1
-            int                      d0,  // dilation dimension 0
-            int                      d1); // dilation dimension 1
-
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,   // convolution kernel
@@ -1577,6 +1582,23 @@ extern "C" {
             int                   s,  // stride
             int                   d); // dilation
 
+    // depthwise
+    // TODO: this is very likely wrong for some cases! - needs more testing
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d_dw(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,   // convolution kernel
+            struct lm_ggml_tensor  * b,   // data
+            int                      s0,  // stride
+            int                      p0,  // padding
+            int                      d0); // dilation
+
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d_dw_ph(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,   // convolution kernel
+            struct lm_ggml_tensor  * b,   // data
+            int                      s0,  // stride
+            int                      d0); // dilation
+
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,   // convolution kernel
@@ -1596,7 +1618,6 @@ extern "C" {
             int                   d0, // dilation dimension 0
             int                   d1); // dilation dimension 1
 
-
     // kernel size is a->ne[0] x a->ne[1]
     // stride is equal to kernel size
     // padding is zero
@@ -1623,6 +1644,18 @@ extern "C" {
             struct lm_ggml_tensor  * a,
             struct lm_ggml_tensor  * b);
 
+    // depthwise
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_2d_dw(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,   // convolution kernel
+            struct lm_ggml_tensor  * b,   // data
+            int                      s0,  // stride dimension 0
+            int                      s1,  // stride dimension 1
+            int                      p0,  // padding dimension 0
+            int                      p1,  // padding dimension 1
+            int                      d0,  // dilation dimension 0
+            int                      d1); // dilation dimension 1
+
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_2d_p0(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
@@ -1696,6 +1729,13 @@ extern "C" {
             int                  p2,
             int                  p3);
 
+    // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_pad_reflect_1d(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            int                      p0,
+            int                      p1);
+
     // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
     // timesteps: [N,]
     // return: [N, dim]
@@ -2198,11 +2238,19 @@ extern "C" {
     LM_GGML_API size_t lm_gguf_get_meta_size(const struct lm_gguf_context * ctx);
     LM_GGML_API void   lm_gguf_get_meta_data(const struct lm_gguf_context * ctx, void * data);
 
-#ifdef  __cplusplus
-// restrict not standard in C++
-#define LM_GGML_RESTRICT
+#ifdef __cplusplus
+    // restrict not standard in C++
+#    if defined(__GNUC__)
+#        define LM_GGML_RESTRICT __restrict__
+#    elif defined(__clang__)
+#        define LM_GGML_RESTRICT __restrict
+#    elif defined(_MSC_VER)
+#        define LM_GGML_RESTRICT __restrict
+#    else
+#        define LM_GGML_RESTRICT
+#    endif
 #else
-#define LM_GGML_RESTRICT restrict
+#    define LM_GGML_RESTRICT restrict
 #endif
     typedef void (*lm_ggml_to_float_t)  (const void  * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int64_t k);
     typedef void (*lm_ggml_from_float_t)(const float * LM_GGML_RESTRICT x, void  * LM_GGML_RESTRICT y, int64_t k);
package/cpp/llama-grammar.cpp
CHANGED
@@ -822,15 +822,11 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
     return grammar->stacks;
 }
 
-void llama_grammar_accept(
-        const llama_grammar_rules  & rules,
-        const llama_grammar_stacks & stacks,
-        const uint32_t chr,
-        llama_grammar_stacks & stacks_new) {
-    stacks_new.clear();
-    stacks_new.reserve(stacks.size());
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
+    llama_grammar_stacks stacks_new;
+    stacks_new.reserve(grammar->stacks.size());
 
-    for (const auto & stack : stacks) {
+    for (const auto & stack : grammar->stacks) {
         if (stack.empty()) {
             continue;
         }
@@ -844,9 +840,11 @@ void llama_grammar_accept(
             if (!llama_grammar_is_end_of_sequence(pos)) {
                 new_stack.push_back(pos);
             }
-            llama_grammar_advance_stack(rules, new_stack, stacks_new);
+            llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
         }
     }
+
+    grammar->stacks = std::move(stacks_new);
 }
 
 llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@@ -1051,7 +1049,12 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
 }
 
 struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
-    llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, };
+    llama_grammar * result = new llama_grammar {
+        grammar.vocab,
+        grammar.rules,
+        grammar.stacks,
+        grammar.partial_utf8,
+    };
 
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
@@ -1059,7 +1062,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
             for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) {
                 for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) {
                     if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) {
-                        result->stacks[is][ie] =  &result->rules[ir0][ir1];
+                        result->stacks[is][ie] = &result->rules[ir0][ir1];
                     }
                 }
             }
@@ -1126,11 +1129,8 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
     const auto   decoded     = decode_utf8(piece, grammar.partial_utf8);
     const auto & code_points = decoded.first;
 
-    llama_grammar_stacks stacks_new;
-
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        llama_grammar_accept(grammar.rules, grammar.stacks, *it, stacks_new);
-        grammar.stacks = std::move(stacks_new);
+        llama_grammar_accept(&grammar, *it);
     }
 
     grammar.partial_utf8 = decoded.second;
package/cpp/llama-grammar.h
CHANGED
@@ -58,6 +58,7 @@ using llama_grammar_rules = std::vector<llama_grammar_rule>;
 using llama_grammar_stacks     = std::vector<llama_grammar_stack>;
 using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
 
+// TODO: remove, needed for tests atm
 const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
       llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);
 
@@ -65,11 +66,7 @@ const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar
 // be positioned at a character range (see `llama_grammar_advance_stack`), and
 // produces the N possible stacks if the given char is accepted at those
 // positions
-void llama_grammar_accept(
-        const llama_grammar_rules  & rules,
-        const llama_grammar_stacks & stacks,
-        uint32_t chr,
-        llama_grammar_stacks & stacks_new);
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr);
 
 std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
         const llama_grammar_rules & rules,
package/cpp/llama-sampling.cpp
CHANGED
@@ -1397,19 +1397,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
 // penalties
 
 struct llama_sampler_penalties {
-    const int32_t     n_vocab;
-    const llama_token special_eos_id;
-    const llama_token linefeed_id;
-
     const int32_t penalty_last_n;
     const float   penalty_repeat;
     const float   penalty_freq;
     const float   penalty_present;
 
-    const bool penalize_nl;
-    const bool ignore_eos;
-
     ring_buffer<llama_token> prev;
+
+    // a frequency map to count token occurrences
+    std::unordered_map<llama_token, int> token_count;
 };
 
 static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1422,76 +1418,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
         return;
     }
 
-    ctx->prev.push_back(token);
-}
-
-static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+    ctx->token_count[token]++;
 
-    if (ctx->ignore_eos) {
-        assert(ctx->special_eos_id >= 0);
+    // if the ring buffer is full, remove the oldest token
+    if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+        const auto old = ctx->prev.front();
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
-            cur_p->data[ctx->special_eos_id].logit = -INFINITY;
-        } else {
-            // else, search for the special EOS token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->special_eos_id) {
-                    cur_p->data[i].logit = -INFINITY;
-                    break;
-                }
-            }
+        ctx->token_count[old]--;
+        if (ctx->token_count[old] == 0) {
+            ctx->token_count.erase(old);
         }
     }
 
-    if ((ctx->penalty_last_n == 0) ||
-        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
-        return;
-    }
-
-    bool   nl_found = false;
-    size_t nl_idx = 0;
-    float  nl_logit = -INFINITY;
-    if (!ctx->penalize_nl) {
-        assert(ctx->linefeed_id >= 0);
+    ctx->prev.push_back(token);
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
-            nl_found = true;
-            nl_idx = ctx->linefeed_id;
-            nl_logit = cur_p->data[ctx->linefeed_id].logit;
-        } else {
-            // else, search for the linefeed token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->linefeed_id) {
-                    nl_found = true;
-                    nl_idx = i;
-                    nl_logit = cur_p->data[i].logit;
-                    break;
-                }
-            }
-        }
+#if 0
+    // sanity check
+    std::unordered_map<llama_token, int> tmp;
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+        tmp[ctx->prev.rat(i)]++;
     }
 
-    // Create a frequency map to count occurrences of each token in last_tokens
-    // TODO: optimize this by maintaining the token count in the sampler context
-    using llama_token_cnt = std::unordered_map<llama_token, int>;
-    llama_token_cnt token_count;
+    assert(ctx->token_count == tmp);
+#endif
+}
+
+static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
 
-    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
-        token_count[ctx->prev.rat(i)]++;
-    }
+    if ((ctx->penalty_last_n == 0) ||
+        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
+        return;
+    }
 
     // Apply frequency and presence penalties to the cur_p
     for (size_t i = 0; i < cur_p->size; ++i) {
-        const auto token_iter = token_count.find(cur_p->data[i].id);
-        if (token_iter == token_count.end()) {
+        const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+        if (token_iter == ctx->token_count.end()) {
             continue;
         }
 
         const int count = token_iter->second;
 
+        assert(count > 0 && count <= ctx->penalty_last_n);
+
         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
         if (cur_p->data[i].logit <= 0) {
@@ -1504,30 +1474,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
     }
 
     cur_p->sorted = false;
-
-    if (!ctx->penalize_nl && nl_found) {
-        // restore the logit of the newline token if it was penalized
-        cur_p->data[nl_idx].logit = nl_logit;
-    }
 }
 
 static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_penalties *) smpl->ctx;
     ctx->prev.clear();
+    ctx->token_count.clear();
 }
 
 static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
     auto * result = llama_sampler_init_penalties(
-            ctx->n_vocab,
-            ctx->special_eos_id,
-            ctx->linefeed_id,
             ctx->penalty_last_n,
             ctx->penalty_repeat,
             ctx->penalty_freq,
-            ctx->penalty_present,
-            ctx->penalize_nl,
-            ctx->ignore_eos);
+            ctx->penalty_present);
 
     // copy the state
     {
@@ -1553,38 +1514,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
 };
 
 struct llama_sampler * llama_sampler_init_penalties(
-        int32_t n_vocab,
-        llama_token special_eos_id,
-        llama_token linefeed_id,
         int32_t penalty_last_n,
         float penalty_repeat,
         float penalty_freq,
-        float penalty_present,
-        bool penalize_nl,
-        bool ignore_eos) {
-    if (linefeed_id == LLAMA_TOKEN_NULL) {
-        penalize_nl = true;
-    }
-
-    if (special_eos_id == LLAMA_TOKEN_NULL) {
-        ignore_eos = false;
-    }
-
+        float penalty_present) {
    penalty_last_n = std::max(penalty_last_n, 0);
 
     return new llama_sampler {
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx   = */ new llama_sampler_penalties {
-            /* .n_vocab         = */ n_vocab,
-            /* .special_eos_id  = */ special_eos_id,
-            /* .linefeed_id     = */ linefeed_id,
             /* .penalty_last_n  = */ penalty_last_n,
             /* .penalty_repeat  = */ penalty_repeat,
             /* .penalty_freq    = */ penalty_freq,
             /* .penalty_present = */ penalty_present,
-            /* .penalize_nl     = */ penalize_nl,
-            /* .ignore_eos      = */ ignore_eos,
             /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
+            /* .token_count     = */ {},
         },
     };
 }
@@ -1612,7 +1556,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
         if (word.find(str) != std::string::npos) {
             token_sequences.emplace(token_id, std::vector<llama_token>());
         } else {
-            size_t word_len = word.size(), str_len = str.size();
+            size_t word_len = word.size();
+            size_t str_len = str.size();
             size_t pos = -1;
             while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
                 bool match = true;
package/cpp/llama-vocab.cpp
CHANGED
@@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
             case LLAMA_VOCAB_PRE_TYPE_EXAONE:
+            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -737,7 +738,7 @@ struct llm_tokenizer_wpm_session {
         std::vector<std::string> words(1, "");
 
         for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
+            const auto flags = unicode_cpt_flags_from_cpt(cpt);
 
             if (flags.is_whitespace) {
                 if (words.back().size()) {  // finish previous word if any
@@ -1656,7 +1657,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
 }
 
 llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
-    return vocab.special_bos_id;
+    return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
 }
 
 llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
@@ -1866,6 +1867,10 @@ int32_t llama_detokenize_impl(
         int32_t text_len_max,
            bool remove_special,
            bool unparse_special) {
+    if (vocab.type == LLAMA_VOCAB_TYPE_NONE) {
+        return 0;
+    }
+
     LM_GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
 
     int32_t avail = text_len_max;
package/cpp/llama-vocab.h
CHANGED
@@ -45,7 +45,7 @@ struct llama_vocab {
     id special_unk_id  = 0;
     id special_sep_id  = LLAMA_TOKEN_NULL;
     id special_pad_id  = LLAMA_TOKEN_NULL;
-    id special_cls_id  = LLAMA_TOKEN_NULL;
+    id special_cls_id  = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
     id special_mask_id = LLAMA_TOKEN_NULL;
 
     id linefeed_id = 13;