cui-llama.rn 1.4.6 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +9 -2
- package/android/src/main/jni.cpp +52 -34
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/cpp/binary-ops.cpp +158 -0
- package/cpp/binary-ops.h +16 -0
- package/cpp/chat.cpp +1769 -1779
- package/cpp/chat.h +9 -1
- package/cpp/common.cpp +20 -522
- package/cpp/common.h +13 -36
- package/cpp/cpu-common.h +72 -0
- package/cpp/ggml-common.h +12 -6
- package/cpp/ggml-cpu-aarch64.cpp +1557 -80
- package/cpp/ggml-cpu-impl.h +2 -21
- package/cpp/ggml-cpu-quants.c +904 -405
- package/cpp/ggml-cpu.c +909 -13237
- package/cpp/ggml-impl.h +50 -23
- package/cpp/ggml-metal-impl.h +77 -3
- package/cpp/ggml-metal.m +794 -580
- package/cpp/ggml.c +92 -3
- package/cpp/ggml.h +29 -5
- package/cpp/gguf.cpp +1 -0
- package/cpp/llama-adapter.cpp +55 -20
- package/cpp/llama-adapter.h +11 -9
- package/cpp/llama-arch.cpp +217 -16
- package/cpp/llama-arch.h +25 -0
- package/cpp/llama-batch.h +2 -2
- package/cpp/llama-chat.cpp +54 -2
- package/cpp/llama-chat.h +3 -0
- package/cpp/llama-context.cpp +2294 -1238
- package/cpp/llama-context.h +214 -77
- package/cpp/llama-cparams.h +1 -0
- package/cpp/llama-graph.cpp +1695 -0
- package/cpp/llama-graph.h +592 -0
- package/cpp/llama-hparams.cpp +8 -0
- package/cpp/llama-hparams.h +17 -0
- package/cpp/llama-io.cpp +15 -0
- package/cpp/llama-io.h +35 -0
- package/cpp/llama-kv-cache.cpp +965 -303
- package/cpp/llama-kv-cache.h +145 -151
- package/cpp/llama-memory.cpp +1 -0
- package/cpp/llama-memory.h +21 -0
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +10 -5
- package/cpp/llama-model-loader.h +5 -3
- package/cpp/llama-model.cpp +9194 -201
- package/cpp/llama-model.h +40 -1
- package/cpp/llama-sampling.cpp +5 -0
- package/cpp/llama-vocab.cpp +36 -5
- package/cpp/llama.cpp +51 -9984
- package/cpp/llama.h +102 -22
- package/cpp/log.cpp +34 -0
- package/cpp/minja/chat-template.hpp +15 -7
- package/cpp/minja/minja.hpp +120 -94
- package/cpp/ops.cpp +8723 -0
- package/cpp/ops.h +128 -0
- package/cpp/rn-llama.cpp +44 -53
- package/cpp/rn-llama.h +2 -12
- package/cpp/sampling.cpp +3 -0
- package/cpp/sgemm.cpp +533 -88
- package/cpp/simd-mappings.h +888 -0
- package/cpp/speculative.cpp +4 -4
- package/cpp/unary-ops.cpp +186 -0
- package/cpp/unary-ops.h +28 -0
- package/cpp/vec.cpp +258 -0
- package/cpp/vec.h +802 -0
- package/ios/CMakeLists.txt +5 -2
- package/ios/RNLlama.mm +2 -2
- package/ios/RNLlamaContext.mm +40 -24
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +6 -4
- package/src/index.ts +3 -1
- package/cpp/chat-template.hpp +0 -529
- package/cpp/minja.hpp +0 -2915
package/cpp/llama-model.h
CHANGED
@@ -2,7 +2,9 @@
 
 #include "llama.h"
 #include "llama-arch.h"
+#include "llama-graph.h"
 #include "llama-hparams.h"
+#include "llama-memory.h"
 #include "llama-vocab.h"
 
 #include <memory>
@@ -10,6 +12,8 @@
 #include <unordered_map>
 #include <vector>
 
+struct llama_cparams;
+struct llama_ubatch;
 struct llama_model_loader;
 
 // available models
@@ -25,6 +29,7 @@ enum llm_type {
     LLM_TYPE_109M,
     LLM_TYPE_137M,
     LLM_TYPE_160M,
+    LLM_TYPE_190M,
     LLM_TYPE_220M,
     LLM_TYPE_250M,
     LLM_TYPE_270M,
@@ -39,8 +44,10 @@ enum llm_type {
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
+    LLM_TYPE_2_9B,
     LLM_TYPE_3B,
     LLM_TYPE_4B,
     LLM_TYPE_6B,
@@ -78,6 +85,9 @@ enum llm_type {
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
     LLM_TYPE_27B,
+    LLM_TYPE_290B,
+    LLM_TYPE_17B_16E,  // llama4 Scout
+    LLM_TYPE_17B_128E, // llama4 Maverick
 };
 
 struct llama_layer_posnet {
@@ -256,6 +266,20 @@ struct llama_layer {
     struct lm_ggml_tensor * time_mix_receptance_b = nullptr;
     struct lm_ggml_tensor * time_mix_gate = nullptr;
 
+    // rwkv7
+    struct lm_ggml_tensor * time_mix_w0 = nullptr;
+    struct lm_ggml_tensor * time_mix_a0 = nullptr;
+    struct lm_ggml_tensor * time_mix_a1 = nullptr;
+    struct lm_ggml_tensor * time_mix_a2 = nullptr;
+    struct lm_ggml_tensor * time_mix_v0 = nullptr;
+    struct lm_ggml_tensor * time_mix_v1 = nullptr;
+    struct lm_ggml_tensor * time_mix_v2 = nullptr;
+    struct lm_ggml_tensor * time_mix_g1 = nullptr;
+    struct lm_ggml_tensor * time_mix_g2 = nullptr;
+    struct lm_ggml_tensor * time_mix_k_k = nullptr;
+    struct lm_ggml_tensor * time_mix_k_a = nullptr;
+    struct lm_ggml_tensor * time_mix_r_k = nullptr;
+
     struct lm_ggml_tensor * time_mix_ln = nullptr;
     struct lm_ggml_tensor * time_mix_ln_b = nullptr;
     struct lm_ggml_tensor * time_mix_output = nullptr;
@@ -347,7 +371,7 @@ struct llama_model {
     std::string desc() const;
 
     size_t size() const;
-    size_t
+    size_t n_tensors() const;
     size_t n_devices() const;
 
     // total number of parameters in the model
@@ -360,11 +384,26 @@ struct llama_model {
 
     lm_ggml_backend_buffer_type_t select_buft(int il) const;
 
+    bool has_tensor_overrides() const;
+
     const struct lm_ggml_tensor * get_tensor(const char * name) const;
 
+    // TODO: move this to new llm_arch_model_i interface
+    llama_memory_i * create_memory() const; // TODO: params
+
+    // TODO: move this to new llm_arch_model_i interface
+    llm_graph_result_ptr build_graph(
+            const llm_graph_params & params,
+            lm_ggml_cgraph * gf,
+            llm_graph_type type) const;
+
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;
 };
 
 const char * llm_type_name(llm_type type);
+
+// For internal test use
+// TODO: remove
+const std::vector<std::pair<std::string, lm_ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
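The new `create_memory()` and `build_graph()` members move architecture-specific construction behind `llama_model`, with callers consuming the results through the `llama_memory_i` and `llm_graph_result_ptr` interfaces declared in the newly included llama-memory.h and llama-graph.h. Below is a minimal, self-contained sketch of that ownership split; the types are simplified stand-ins for illustration only, not the real library declarations.

```cpp
// Simplified stand-ins (not the real llama.cpp/rn-llama declarations) showing
// the split: the model constructs arch-specific pieces, a context consumes them.
#include <cstdio>
#include <memory>

struct memory_i {                       // stand-in for llama_memory_i
    virtual ~memory_i() = default;
    virtual const char * name() const = 0;
};

struct kv_cache : memory_i {            // e.g. a unified KV cache implementation
    const char * name() const override { return "kv-cache"; }
};

struct graph_result {                   // stand-in for what llm_graph_result_ptr points to
    int n_nodes = 0;
};

struct model {                          // stand-in for llama_model
    std::unique_ptr<memory_i> create_memory() const {
        return std::make_unique<kv_cache>();   // the architecture picks the memory layout
    }
    std::unique_ptr<graph_result> build_graph(int n_tokens) const {
        auto res = std::make_unique<graph_result>();
        res->n_nodes = 3 * n_tokens;           // placeholder for real graph construction
        return res;
    }
};

int main() {
    model m;
    auto mem = m.create_memory();   // the context no longer builds these itself
    auto gf  = m.build_graph(8);
    std::printf("memory=%s nodes=%d\n", mem->name(), gf->n_nodes);
    return 0;
}
```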
package/cpp/llama-sampling.cpp
CHANGED
@@ -1478,6 +1478,7 @@ static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sam
     const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
 
     auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
+    LM_GGML_ASSERT(result);
 
     // copy the state
     {
@@ -1549,6 +1550,10 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
            /* .grammar_root = */ grammar_root,
            /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
        };
+        if (!ctx->grammar) {
+            delete ctx;
+            return nullptr;
+        }
     } else {
         *ctx = {
            /* .vocab        = */ vocab,
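With the added null check, a grammar string that fails to parse now surfaces as a null sampler instead of a sampler wrapping a null `grammar` pointer. A hedged sketch of how a caller might guard for that through the public `llama_sampler_init_grammar` API follows; the vocab setup is omitted and the fallback behaviour shown is illustrative, not part of the package.

```cpp
#include "llama.h"
#include <cstdio>

// Sketch only: `vocab` comes from an already-loaded model; error handling is illustrative.
static struct llama_sampler * make_grammar_sampler(const struct llama_vocab * vocab,
                                                   const char * gbnf) {
    // With this change, an unparsable GBNF string yields nullptr here rather
    // than a sampler that would later dereference a null grammar.
    struct llama_sampler * smpl = llama_sampler_init_grammar(vocab, gbnf, "root");
    if (smpl == nullptr) {
        std::fprintf(stderr, "grammar rejected, continuing without a grammar constraint\n");
    }
    return smpl;
}
```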
package/cpp/llama-vocab.cpp
CHANGED
@@ -342,6 +342,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_MPT:
             case LLAMA_VOCAB_PRE_TYPE_OLMO:
             case LLAMA_VOCAB_PRE_TYPE_JAIS:
+            case LLAMA_VOCAB_PRE_TYPE_TRILLION:
                 regex_exprs = {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
@@ -400,6 +401,20 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
+                regex_exprs = {
+                    "\\p{N}+",
+                    "(?=(\\d{3})+(?!\\d))",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
+                    // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
+                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1601,9 +1616,22 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "megrez") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
             } else if (
-                tokenizer_pre == "gpt-4o") {
+                tokenizer_pre == "gpt-4o" ||
+                tokenizer_pre == "llama4") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "superbpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "trillion") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "bailingmoe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -1781,6 +1809,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<end_of_turn>"
                     || t.first == "<|endoftext|>"
                     || t.first == "<EOT>"
+                    || t.first == "_<EOT>"
                     || t.first == "<|end▁of▁sentence|>" // DeepSeek
                    ) {
                    special_eot_id = t.second;
@@ -1813,6 +1842,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<fim-prefix>"
                    || t.first == "<|fim▁begin|>" // DeepSeek
                    || t.first == "<PRE>"
+                   || t.first == "▁<PRE>" // CodeLlama
                   ) {
                    special_fim_pre_id = t.second;
                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1830,6 +1860,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<fim-suffix>"
                    || t.first == "<|fim▁hole|>" // DeepSeek
                    || t.first == "<SUF>"
+                   || t.first == "▁<SUF>" // CodeLlama
                   ) {
                    special_fim_suf_id = t.second;
                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1847,6 +1878,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<fim-middle>"
                    || t.first == "<|fim▁end|>" // DeepSeek
                    || t.first == "<MID>"
+                   || t.first == "▁<MID>" // CodeLlama
                   ) {
                    special_fim_mid_id = t.second;
                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1931,6 +1963,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<|endoftext|>"
                    || t.first == "<|eom_id|>"
                    || t.first == "<EOT>"
+                   || t.first == "_<EOT>"
                   ) {
                    special_eog_ids.insert(t.second);
                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2189,14 +2222,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer
            // find the first occurrence of a given special token in this fragment
            // passing offset argument only limit the "search area" but match coordinates
            // are still relative to the source full raw_text
-            auto match = raw_text.find(text, raw_text_base_offset);
+            // string_view begins at pos 0 for the same reason
+            auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);
 
            // no occurrences found, stop processing this fragment for a given special token
            if (match == std::string::npos) break;
 
-            // check if match is within bounds of offset <-> length
-            if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
-
 #ifdef PRETOKENIZERDEBUG
            LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif