@novastera-oss/llamarn 0.2.4 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. package/RNLlamaCpp.podspec +3 -2
  2. package/android/CMakeLists.txt +6 -3
  3. package/android/src/main/cpp/include/llama.h +12 -8
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  12. package/cpp/LlamaCppModel.cpp +46 -65
  13. package/cpp/LlamaCppModel.h +5 -0
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/README.md +1 -0
  16. package/cpp/llama.cpp/common/CMakeLists.txt +5 -8
  17. package/cpp/llama.cpp/common/arg.cpp +8 -6
  18. package/cpp/llama.cpp/common/chat-parser.cpp +4 -3
  19. package/cpp/llama.cpp/common/chat-parser.h +2 -1
  20. package/cpp/llama.cpp/common/chat.cpp +4 -4
  21. package/cpp/llama.cpp/common/common.cpp +2 -0
  22. package/cpp/llama.cpp/common/json-partial.cpp +5 -4
  23. package/cpp/llama.cpp/common/json-partial.h +2 -1
  24. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  25. package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
  26. package/cpp/llama.cpp/convert_hf_to_gguf.py +31 -28
  27. package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
  28. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +2 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
  30. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +23 -0
  32. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1 -1
  34. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +19 -8
  35. package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -8
  38. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
  39. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1 -1
  40. package/cpp/llama.cpp/ggml/src/ggml.c +9 -2
  41. package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
  42. package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
  43. package/cpp/llama.cpp/include/llama.h +12 -8
  44. package/cpp/llama.cpp/src/CMakeLists.txt +3 -0
  45. package/cpp/llama.cpp/src/llama-batch.cpp +19 -12
  46. package/cpp/llama.cpp/src/llama-batch.h +15 -10
  47. package/cpp/llama.cpp/src/llama-context.cpp +226 -151
  48. package/cpp/llama.cpp/src/llama-context.h +25 -8
  49. package/cpp/llama.cpp/src/llama-graph.cpp +50 -47
  50. package/cpp/llama.cpp/src/llama-graph.h +25 -24
  51. package/cpp/llama.cpp/src/llama-kv-cache-recurrent.cpp +1132 -0
  52. package/cpp/llama.cpp/src/llama-kv-cache-recurrent.h +191 -0
  53. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +249 -0
  54. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +136 -0
  55. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1717 -0
  56. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +278 -0
  57. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2746
  58. package/cpp/llama.cpp/src/llama-kv-cache.h +14 -472
  59. package/cpp/llama.cpp/src/llama-kv-cells.h +37 -6
  60. package/cpp/llama.cpp/src/llama-memory.h +44 -0
  61. package/cpp/llama.cpp/src/llama-model.cpp +23 -16
  62. package/cpp/llama.cpp/src/llama-vocab.cpp +7 -2
  63. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
  64. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
  65. package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
  66. package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
  67. package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
  68. package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  69. package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
  70. package/cpp/rn-completion.cpp +101 -52
  71. package/cpp/rn-utils.hpp +8 -1
  72. package/ios/include/common/minja/chat-template.hpp +1 -1
  73. package/ios/include/common/minja/minja.hpp +1 -1
  74. package/ios/include/json-schema-to-grammar.h +4 -4
  75. package/ios/include/llama.h +12 -8
  76. package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
  77. package/ios/libs/llama.xcframework/Info.plist +22 -22
  78. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  79. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4617
  80. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
  81. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +12 -8
  82. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  83. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  84. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
  85. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3557
  86. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  87. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  88. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  89. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  90. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
  91. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3624 -3559
  92. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
  93. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +12 -8
  94. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
  95. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +12 -8
  96. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  97. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
  98. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +12 -8
  99. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  100. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  101. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  102. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4616
  103. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
  104. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +12 -8
  105. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  106. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  107. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4637
  108. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3556
  109. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  110. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  111. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  112. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  113. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4725 -4653
  114. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
  115. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +12 -8
  116. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  117. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  118. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4746 -4674
  119. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3652 -3587
  120. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  121. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  122. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  123. package/package.json +1 -1
@@ -5,7 +5,10 @@
5
5
  #include "llama-batch.h"
6
6
  #include "llama-cparams.h"
7
7
  #include "llama-model-loader.h"
8
- #include "llama-kv-cache.h"
8
+
9
+ #include "llama-kv-cache-unified.h"
10
+ #include "llama-kv-cache-unified-iswa.h"
11
+ #include "llama-kv-cache-recurrent.h"
9
12
 
10
13
  #include "ggml-cpp.h"
11
14
 
@@ -8892,9 +8895,9 @@ struct llm_build_mamba : public llm_graph_context {
8892
8895
  ggml_tensor * state_mask,
8893
8896
  const llama_ubatch & ubatch,
8894
8897
  int il) const {
8895
- const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
8898
+ const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
8896
8899
 
8897
- const auto kv_head = kv_self->head;
8900
+ const auto kv_head = kv_state->get_head();
8898
8901
 
8899
8902
  const int64_t d_conv = hparams.ssm_d_conv;
8900
8903
  const int64_t d_inner = hparams.ssm_d_inner;
@@ -8912,8 +8915,8 @@ struct llm_build_mamba : public llm_graph_context {
8912
8915
  GGML_ASSERT(ubatch.equal_seqs);
8913
8916
  GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
8914
8917
 
8915
- ggml_tensor * conv_states_all = kv_self->k_l[il];
8916
- ggml_tensor * ssm_states_all = kv_self->v_l[il];
8918
+ ggml_tensor * conv_states_all = kv_state->get_k_l(il);
8919
+ ggml_tensor * ssm_states_all = kv_state->get_v_l(il);
8917
8920
 
8918
8921
  // (ab)using the KV cache to store the states
8919
8922
  ggml_tensor * conv = build_copy_mask_state(
@@ -11640,7 +11643,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11640
11643
  ggml_tensor * state_mask,
11641
11644
  const llama_ubatch & ubatch,
11642
11645
  int il) const {
11643
- const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
11646
+ const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
11644
11647
 
11645
11648
  const auto n_tokens = ubatch.n_tokens;
11646
11649
  const auto n_seqs = ubatch.n_seqs;
@@ -11650,7 +11653,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11650
11653
  const auto n_head = n_embd / head_size;
11651
11654
  const auto n_head_kv = hparams.n_head_kv(il);
11652
11655
 
11653
- const auto kv_head = kv_self->head;
11656
+ const auto kv_head = kv_state->get_head();
11654
11657
 
11655
11658
  const auto & layer = model.layers[il];
11656
11659
 
@@ -11762,7 +11765,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11762
11765
  }
11763
11766
 
11764
11767
  ggml_tensor * wkv_state = build_copy_mask_state(
11765
- gf, kv_self->v_l[il], state_copy, state_mask,
11768
+ gf, kv_state->get_v_l(il), state_copy, state_mask,
11766
11769
  hparams.n_embd_v_s(), n_seqs);
11767
11770
 
11768
11771
  ggml_tensor * wkv_output;
@@ -11781,9 +11784,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11781
11784
  wkv_state,
11782
11785
  ggml_view_1d(
11783
11786
  ctx0,
11784
- kv_self->v_l[il],
11787
+ kv_state->get_v_l(il),
11785
11788
  hparams.n_embd_v_s() * n_seqs,
11786
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
11789
+ hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_state->get_v_l(il))
11787
11790
  )
11788
11791
  )
11789
11792
  );
@@ -12036,7 +12039,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12036
12039
  ggml_tensor *& first_layer_value,
12037
12040
  const llama_ubatch & ubatch,
12038
12041
  int il) const {
12039
- const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
12042
+ const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
12040
12043
 
12041
12044
  const auto n_tokens = ubatch.n_tokens;
12042
12045
  const auto n_seqs = ubatch.n_seqs;
@@ -12045,7 +12048,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12045
12048
  const auto head_count = n_embd / head_size;
12046
12049
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12047
12050
 
12048
- const auto kv_head = kv_self->head;
12051
+ const auto kv_head = kv_state->get_head();
12049
12052
 
12050
12053
  const auto & layer = model.layers[il];
12051
12054
 
@@ -12116,7 +12119,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12116
12119
  a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
12117
12120
 
12118
12121
  ggml_tensor * wkv_state = build_copy_mask_state(
12119
- gf, kv_self->v_l[il], state_copy, state_mask,
12122
+ gf, kv_state->get_v_l(il), state_copy, state_mask,
12120
12123
  hparams.n_embd_v_s(), n_seqs);
12121
12124
 
12122
12125
  ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
@@ -12130,9 +12133,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12130
12133
  wkv_state,
12131
12134
  ggml_view_1d(
12132
12135
  ctx0,
12133
- kv_self->v_l[il],
12136
+ kv_state->get_v_l(il),
12134
12137
  hparams.n_embd_v_s() * n_seqs,
12135
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
12138
+ hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_state->get_v_l(il))
12136
12139
  )
12137
12140
  )
12138
12141
  );
@@ -13230,7 +13233,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
13230
13233
  params.swa_full,
13231
13234
  cparams.n_ctx,
13232
13235
  cparams.n_seq_max,
13233
- cparams.n_batch,
13236
+ cparams.n_ubatch,
13234
13237
  padding);
13235
13238
  } else {
13236
13239
  GGML_ASSERT(!hparams.is_swa_any());
@@ -13593,6 +13596,10 @@ int32_t llama_model_n_head_kv(const llama_model * model) {
13593
13596
  return model->hparams.n_head_kv();
13594
13597
  }
13595
13598
 
13599
+ int32_t llama_model_n_swa(const llama_model * model) {
13600
+ return model->hparams.n_swa;
13601
+ }
13602
+
13596
13603
  // deprecated
13597
13604
  int32_t llama_n_ctx_train(const llama_model * model) {
13598
13605
  return llama_model_n_ctx_train(model);
@@ -2080,9 +2080,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2080
2080
 
2081
2081
  std::string model_name;
2082
2082
  std::string tokenizer_pre;
2083
+ std::string general_arch;
2083
2084
 
2084
2085
  ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
2085
2086
  ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
2087
+ ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
2086
2088
 
2087
2089
  // model name to lowercase
2088
2090
  std::transform(model_name.begin(), model_name.end(), model_name.begin(),
@@ -2091,8 +2093,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2091
2093
  }
2092
2094
  );
2093
2095
 
2094
- // set attributes by model/tokenizer name
2095
- if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
2096
+ // set attributes by model/tokenizer/architecture name
2097
+ if (false
2098
+ || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
2099
+ || _contains_any(general_arch, {"nomic-bert-moe"})
2100
+ ) {
2096
2101
  _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2097
2102
  } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
2098
2103
  for (auto id : cache_special_tokens) {