@novastera-oss/llamarn 0.2.4 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -2
- package/android/CMakeLists.txt +6 -3
- package/android/src/main/cpp/include/llama.h +12 -8
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +46 -65
- package/cpp/LlamaCppModel.h +5 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +5 -8
- package/cpp/llama.cpp/common/arg.cpp +8 -6
- package/cpp/llama.cpp/common/chat-parser.cpp +4 -3
- package/cpp/llama.cpp/common/chat-parser.h +2 -1
- package/cpp/llama.cpp/common/chat.cpp +4 -4
- package/cpp/llama.cpp/common/common.cpp +2 -0
- package/cpp/llama.cpp/common/json-partial.cpp +5 -4
- package/cpp/llama.cpp/common/json-partial.h +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +31 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +23 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +19 -8
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +9 -2
- package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
- package/cpp/llama.cpp/include/llama.h +12 -8
- package/cpp/llama.cpp/src/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +19 -12
- package/cpp/llama.cpp/src/llama-batch.h +15 -10
- package/cpp/llama.cpp/src/llama-context.cpp +226 -151
- package/cpp/llama.cpp/src/llama-context.h +25 -8
- package/cpp/llama.cpp/src/llama-graph.cpp +50 -47
- package/cpp/llama.cpp/src/llama-graph.h +25 -24
- package/cpp/llama.cpp/src/llama-kv-cache-recurrent.cpp +1132 -0
- package/cpp/llama.cpp/src/llama-kv-cache-recurrent.h +191 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +249 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +136 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1717 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +278 -0
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2746
- package/cpp/llama.cpp/src/llama-kv-cache.h +14 -472
- package/cpp/llama.cpp/src/llama-kv-cells.h +37 -6
- package/cpp/llama.cpp/src/llama-memory.h +44 -0
- package/cpp/llama.cpp/src/llama-model.cpp +23 -16
- package/cpp/llama.cpp/src/llama-vocab.cpp +7 -2
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
- package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
- package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
- package/cpp/rn-completion.cpp +101 -52
- package/cpp/rn-utils.hpp +8 -1
- package/ios/include/common/minja/chat-template.hpp +1 -1
- package/ios/include/common/minja/minja.hpp +1 -1
- package/ios/include/json-schema-to-grammar.h +4 -4
- package/ios/include/llama.h +12 -8
- package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
- package/ios/libs/llama.xcframework/Info.plist +22 -22
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4617
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3557
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3624 -3559
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4616
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4637
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3556
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4725 -4653
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4746 -4674
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3652 -3587
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
|
@@ -5,7 +5,10 @@
|
|
|
5
5
|
#include "llama-batch.h"
|
|
6
6
|
#include "llama-cparams.h"
|
|
7
7
|
#include "llama-model-loader.h"
|
|
8
|
-
|
|
8
|
+
|
|
9
|
+
#include "llama-kv-cache-unified.h"
|
|
10
|
+
#include "llama-kv-cache-unified-iswa.h"
|
|
11
|
+
#include "llama-kv-cache-recurrent.h"
|
|
9
12
|
|
|
10
13
|
#include "ggml-cpp.h"
|
|
11
14
|
|
|
@@ -8892,9 +8895,9 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8892
8895
|
ggml_tensor * state_mask,
|
|
8893
8896
|
const llama_ubatch & ubatch,
|
|
8894
8897
|
int il) const {
|
|
8895
|
-
const
|
|
8898
|
+
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
|
8896
8899
|
|
|
8897
|
-
const auto kv_head =
|
|
8900
|
+
const auto kv_head = kv_state->get_head();
|
|
8898
8901
|
|
|
8899
8902
|
const int64_t d_conv = hparams.ssm_d_conv;
|
|
8900
8903
|
const int64_t d_inner = hparams.ssm_d_inner;
|
|
@@ -8912,8 +8915,8 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
8912
8915
|
GGML_ASSERT(ubatch.equal_seqs);
|
|
8913
8916
|
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
|
8914
8917
|
|
|
8915
|
-
ggml_tensor * conv_states_all =
|
|
8916
|
-
ggml_tensor * ssm_states_all =
|
|
8918
|
+
ggml_tensor * conv_states_all = kv_state->get_k_l(il);
|
|
8919
|
+
ggml_tensor * ssm_states_all = kv_state->get_v_l(il);
|
|
8917
8920
|
|
|
8918
8921
|
// (ab)using the KV cache to store the states
|
|
8919
8922
|
ggml_tensor * conv = build_copy_mask_state(
|
|
@@ -11640,7 +11643,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11640
11643
|
ggml_tensor * state_mask,
|
|
11641
11644
|
const llama_ubatch & ubatch,
|
|
11642
11645
|
int il) const {
|
|
11643
|
-
const
|
|
11646
|
+
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
|
11644
11647
|
|
|
11645
11648
|
const auto n_tokens = ubatch.n_tokens;
|
|
11646
11649
|
const auto n_seqs = ubatch.n_seqs;
|
|
@@ -11650,7 +11653,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11650
11653
|
const auto n_head = n_embd / head_size;
|
|
11651
11654
|
const auto n_head_kv = hparams.n_head_kv(il);
|
|
11652
11655
|
|
|
11653
|
-
const auto kv_head =
|
|
11656
|
+
const auto kv_head = kv_state->get_head();
|
|
11654
11657
|
|
|
11655
11658
|
const auto & layer = model.layers[il];
|
|
11656
11659
|
|
|
@@ -11762,7 +11765,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11762
11765
|
}
|
|
11763
11766
|
|
|
11764
11767
|
ggml_tensor * wkv_state = build_copy_mask_state(
|
|
11765
|
-
gf,
|
|
11768
|
+
gf, kv_state->get_v_l(il), state_copy, state_mask,
|
|
11766
11769
|
hparams.n_embd_v_s(), n_seqs);
|
|
11767
11770
|
|
|
11768
11771
|
ggml_tensor * wkv_output;
|
|
@@ -11781,9 +11784,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
11781
11784
|
wkv_state,
|
|
11782
11785
|
ggml_view_1d(
|
|
11783
11786
|
ctx0,
|
|
11784
|
-
|
|
11787
|
+
kv_state->get_v_l(il),
|
|
11785
11788
|
hparams.n_embd_v_s() * n_seqs,
|
|
11786
|
-
hparams.n_embd_v_s() * kv_head * ggml_element_size(
|
|
11789
|
+
hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_state->get_v_l(il))
|
|
11787
11790
|
)
|
|
11788
11791
|
)
|
|
11789
11792
|
);
|
|
@@ -12036,7 +12039,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12036
12039
|
ggml_tensor *& first_layer_value,
|
|
12037
12040
|
const llama_ubatch & ubatch,
|
|
12038
12041
|
int il) const {
|
|
12039
|
-
const
|
|
12042
|
+
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
|
12040
12043
|
|
|
12041
12044
|
const auto n_tokens = ubatch.n_tokens;
|
|
12042
12045
|
const auto n_seqs = ubatch.n_seqs;
|
|
@@ -12045,7 +12048,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12045
12048
|
const auto head_count = n_embd / head_size;
|
|
12046
12049
|
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
12047
12050
|
|
|
12048
|
-
const auto kv_head =
|
|
12051
|
+
const auto kv_head = kv_state->get_head();
|
|
12049
12052
|
|
|
12050
12053
|
const auto & layer = model.layers[il];
|
|
12051
12054
|
|
|
@@ -12116,7 +12119,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12116
12119
|
a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
|
|
12117
12120
|
|
|
12118
12121
|
ggml_tensor * wkv_state = build_copy_mask_state(
|
|
12119
|
-
gf,
|
|
12122
|
+
gf, kv_state->get_v_l(il), state_copy, state_mask,
|
|
12120
12123
|
hparams.n_embd_v_s(), n_seqs);
|
|
12121
12124
|
|
|
12122
12125
|
ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
|
|
@@ -12130,9 +12133,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
12130
12133
|
wkv_state,
|
|
12131
12134
|
ggml_view_1d(
|
|
12132
12135
|
ctx0,
|
|
12133
|
-
|
|
12136
|
+
kv_state->get_v_l(il),
|
|
12134
12137
|
hparams.n_embd_v_s() * n_seqs,
|
|
12135
|
-
hparams.n_embd_v_s() * kv_head * ggml_element_size(
|
|
12138
|
+
hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_state->get_v_l(il))
|
|
12136
12139
|
)
|
|
12137
12140
|
)
|
|
12138
12141
|
);
|
|
@@ -13230,7 +13233,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
13230
13233
|
params.swa_full,
|
|
13231
13234
|
cparams.n_ctx,
|
|
13232
13235
|
cparams.n_seq_max,
|
|
13233
|
-
cparams.
|
|
13236
|
+
cparams.n_ubatch,
|
|
13234
13237
|
padding);
|
|
13235
13238
|
} else {
|
|
13236
13239
|
GGML_ASSERT(!hparams.is_swa_any());
|
|
@@ -13593,6 +13596,10 @@ int32_t llama_model_n_head_kv(const llama_model * model) {
|
|
|
13593
13596
|
return model->hparams.n_head_kv();
|
|
13594
13597
|
}
|
|
13595
13598
|
|
|
13599
|
+
int32_t llama_model_n_swa(const llama_model * model) {
|
|
13600
|
+
return model->hparams.n_swa;
|
|
13601
|
+
}
|
|
13602
|
+
|
|
13596
13603
|
// deprecated
|
|
13597
13604
|
int32_t llama_n_ctx_train(const llama_model * model) {
|
|
13598
13605
|
return llama_model_n_ctx_train(model);
|
|
@@ -2080,9 +2080,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
2080
2080
|
|
|
2081
2081
|
std::string model_name;
|
|
2082
2082
|
std::string tokenizer_pre;
|
|
2083
|
+
std::string general_arch;
|
|
2083
2084
|
|
|
2084
2085
|
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
|
|
2085
2086
|
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
|
2087
|
+
ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
|
|
2086
2088
|
|
|
2087
2089
|
// model name to lowercase
|
|
2088
2090
|
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
|
|
@@ -2091,8 +2093,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
2091
2093
|
}
|
|
2092
2094
|
);
|
|
2093
2095
|
|
|
2094
|
-
// set attributes by model/tokenizer name
|
|
2095
|
-
if (
|
|
2096
|
+
// set attributes by model/tokenizer/architecture name
|
|
2097
|
+
if (false
|
|
2098
|
+
|| _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
|
|
2099
|
+
|| _contains_any(general_arch, {"nomic-bert-moe"})
|
|
2100
|
+
) {
|
|
2096
2101
|
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
|
|
2097
2102
|
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
|
|
2098
2103
|
for (auto id : cache_special_tokens) {
|